In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Walk the whole input tree and print every file path so the reader can see
# exactly which datasets are attached to this notebook.

import os

for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
In [5]:
# Installation and Setup
# =============================================================================

!pip install google-genai pandas numpy scikit-learn matplotlib seaborn
!pip install great-expectations dash plotly bokeh openpyxl xmltodict
!pip install google-cloud-bigquery google-cloud-storage
Requirement already satisfied: google-genai in /usr/local/lib/python3.12/dist-packages (1.51.0)
Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)
Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)
Requirement already satisfied: anyio<5.0.0,>=4.8.0 in /usr/local/lib/python3.12/dist-packages (from google-genai) (4.11.0)
Requirement already satisfied: google-auth<3.0.0,>=2.14.1 in /usr/local/lib/python3.12/dist-packages (from google-genai) (2.38.0)
Requirement already satisfied: httpx<1.0.0,>=0.28.1 in /usr/local/lib/python3.12/dist-packages (from google-genai) (0.28.1)
Requirement already satisfied: pydantic<3.0.0,>=2.9.0 in /usr/local/lib/python3.12/dist-packages (from google-genai) (2.11.10)
Requirement already satisfied: requests<3.0.0,>=2.28.1 in /usr/local/lib/python3.12/dist-packages (from google-genai) (2.32.4)
Requirement already satisfied: tenacity<9.2.0,>=8.2.3 in /usr/local/lib/python3.12/dist-packages (from google-genai) (8.5.0)
Requirement already satisfied: websockets<15.1.0,>=13.0.0 in /usr/local/lib/python3.12/dist-packages (from google-genai) (15.0.1)
Requirement already satisfied: typing-extensions<5.0.0,>=4.11.0 in /usr/local/lib/python3.12/dist-packages (from google-genai) (4.15.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.60.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.4.9)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (25.0)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.2.5)
Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.12/dist-packages (from anyio<5.0.0,>=4.8.0->google-genai) (3.11)
Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.12/dist-packages (from anyio<5.0.0,>=4.8.0->google-genai) (1.3.1)
Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from google-auth<3.0.0,>=2.14.1->google-genai) (5.5.2)
Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.12/dist-packages (from google-auth<3.0.0,>=2.14.1->google-genai) (0.4.2)
Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.12/dist-packages (from google-auth<3.0.0,>=2.14.1->google-genai) (4.9.1)
Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0,>=0.28.1->google-genai) (2025.11.12)
Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0,>=0.28.1->google-genai) (1.0.9)
Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1.0.0,>=0.28.1->google-genai) (0.16.0)
Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.9.0->google-genai) (0.7.0)
Requirement already satisfied: pydantic-core==2.33.2 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.9.0->google-genai) (2.33.2)
Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.12/dist-packages (from pydantic<3.0.0,>=2.9.0->google-genai) (0.4.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.28.1->google-genai) (3.4.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.28.1->google-genai) (2.5.0)
Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /usr/local/lib/python3.12/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.0,>=2.14.1->google-genai) (0.6.1)
Collecting great-expectations
  Downloading great_expectations-1.9.1-py3-none-any.whl.metadata (9.2 kB)
Collecting dash
  Downloading dash-3.3.0-py3-none-any.whl.metadata (11 kB)
Requirement already satisfied: plotly in /usr/local/lib/python3.12/dist-packages (5.24.1)
Requirement already satisfied: bokeh in /usr/local/lib/python3.12/dist-packages (3.7.3)
Requirement already satisfied: openpyxl in /usr/local/lib/python3.12/dist-packages (3.1.5)
Collecting xmltodict
  Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB)
Collecting altair<5.0.0,>=4.2.1 (from great-expectations)
  Downloading altair-4.2.2-py3-none-any.whl.metadata (13 kB)
Requirement already satisfied: cryptography>=3.2 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (43.0.3)
Requirement already satisfied: jinja2>=3 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (3.1.6)
Requirement already satisfied: jsonschema>=2.5.1 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (4.25.1)
Collecting marshmallow<4.0.0,>=3.7.1 (from great-expectations)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Requirement already satisfied: mistune>=0.8.4 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (3.1.4)
Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (2.0.2)
Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from great-expectations) (25.0)
Requirement already satisfied: pandas>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (2.2.2)
Requirement already satisfied: pydantic>=1.10.7 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (2.11.10)
Requirement already satisfied: pyparsing!=3.2.4,>=2.4 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (3.2.5)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (2.9.0.post0)
Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (2.32.4)
Collecting ruamel.yaml>=0.16 (from great-expectations)
  Downloading ruamel.yaml-0.18.16-py3-none-any.whl.metadata (25 kB)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (1.16.3)
Requirement already satisfied: tqdm>=4.59.0 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (4.67.1)
Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (4.15.0)
Requirement already satisfied: tzlocal>=1.2 in /usr/local/lib/python3.12/dist-packages (from great-expectations) (5.3.1)
Requirement already satisfied: Flask<3.2,>=1.0.4 in /usr/local/lib/python3.12/dist-packages (from dash) (3.1.2)
Requirement already satisfied: Werkzeug<3.2 in /usr/local/lib/python3.12/dist-packages (from dash) (3.1.3)
Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.12/dist-packages (from dash) (8.7.0)
Collecting retrying (from dash)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.12/dist-packages (from dash) (1.6.0)
Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from dash) (75.2.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.12/dist-packages (from plotly) (8.5.0)
Requirement already satisfied: contourpy>=1.2 in /usr/local/lib/python3.12/dist-packages (from bokeh) (1.3.3)
Requirement already satisfied: narwhals>=1.13 in /usr/local/lib/python3.12/dist-packages (from bokeh) (2.12.0)
Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.12/dist-packages (from bokeh) (11.3.0)
Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.12/dist-packages (from bokeh) (6.0.3)
Requirement already satisfied: tornado>=6.2 in /usr/local/lib/python3.12/dist-packages (from bokeh) (6.5.1)
Requirement already satisfied: xyzservices>=2021.09.1 in /usr/local/lib/python3.12/dist-packages (from bokeh) (2025.10.0)
Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.12/dist-packages (from openpyxl) (2.0.0)
Requirement already satisfied: entrypoints in /usr/local/lib/python3.12/dist-packages (from altair<5.0.0,>=4.2.1->great-expectations) (0.4)
Requirement already satisfied: toolz in /usr/local/lib/python3.12/dist-packages (from altair<5.0.0,>=4.2.1->great-expectations) (0.12.1)
Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=3.2->great-expectations) (2.0.0)
Requirement already satisfied: blinker>=1.9.0 in /usr/local/lib/python3.12/dist-packages (from Flask<3.2,>=1.0.4->dash) (1.9.0)
Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.12/dist-packages (from Flask<3.2,>=1.0.4->dash) (8.3.1)
Requirement already satisfied: itsdangerous>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from Flask<3.2,>=1.0.4->dash) (2.2.0)
Requirement already satisfied: markupsafe>=2.1.1 in /usr/local/lib/python3.12/dist-packages (from Flask<3.2,>=1.0.4->dash) (3.0.3)
Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=2.5.1->great-expectations) (25.4.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=2.5.1->great-expectations) (2025.9.1)
Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=2.5.1->great-expectations) (0.37.0)
Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.12/dist-packages (from jsonschema>=2.5.1->great-expectations) (0.29.0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.3.0->great-expectations) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.3.0->great-expectations) (2025.2)
Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic>=1.10.7->great-expectations) (0.7.0)
Requirement already satisfied: pydantic-core==2.33.2 in /usr/local/lib/python3.12/dist-packages (from pydantic>=1.10.7->great-expectations) (2.33.2)
Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.12/dist-packages (from pydantic>=1.10.7->great-expectations) (0.4.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.1->great-expectations) (1.17.0)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.20->great-expectations) (3.4.4)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests>=2.20->great-expectations) (3.11)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.20->great-expectations) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests>=2.20->great-expectations) (2025.11.12)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.16->great-expectations)
  Downloading ruamel_yaml_clib-0.2.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.12/dist-packages (from importlib-metadata->dash) (3.23.0)
Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=3.2->great-expectations) (2.23)
Downloading great_expectations-1.9.1-py3-none-any.whl (4.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.9/4.9 MB 46.0 MB/s eta 0:00:00
Downloading dash-3.3.0-py3-none-any.whl (7.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 103.2 MB/s eta 0:00:00
Downloading xmltodict-1.0.2-py3-none-any.whl (13 kB)
Downloading altair-4.2.2-py3-none-any.whl (813 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 813.6/813.6 kB 45.2 MB/s eta 0:00:00
Downloading marshmallow-3.26.1-py3-none-any.whl (50 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50.9/50.9 kB 3.7 MB/s eta 0:00:00
Downloading ruamel.yaml-0.18.16-py3-none-any.whl (119 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 119.9/119.9 kB 9.3 MB/s eta 0:00:00
Downloading retrying-1.4.2-py3-none-any.whl (10 kB)
Downloading ruamel_yaml_clib-0.2.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (788 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 788.2/788.2 kB 42.1 MB/s eta 0:00:00
Installing collected packages: xmltodict, ruamel.yaml.clib, retrying, marshmallow, ruamel.yaml, dash, altair, great-expectations
  Attempting uninstall: altair
    Found existing installation: altair 5.5.0
    Uninstalling altair-5.5.0:
      Successfully uninstalled altair-5.5.0
Successfully installed altair-4.2.2 dash-3.3.0 great-expectations-1.9.1 marshmallow-3.26.1 retrying-1.4.2 ruamel.yaml-0.18.16 ruamel.yaml.clib-0.2.15 xmltodict-1.0.2
Requirement already satisfied: google-cloud-bigquery in /usr/local/lib/python3.12/dist-packages (3.38.0)
Requirement already satisfied: google-cloud-storage in /usr/local/lib/python3.12/dist-packages (2.19.0)
Requirement already satisfied: google-api-core<3.0.0,>=2.11.1 in /usr/local/lib/python3.12/dist-packages (from google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (2.28.1)
Requirement already satisfied: google-auth<3.0.0,>=2.14.1 in /usr/local/lib/python3.12/dist-packages (from google-cloud-bigquery) (2.38.0)
Requirement already satisfied: google-cloud-core<3.0.0,>=2.4.1 in /usr/local/lib/python3.12/dist-packages (from google-cloud-bigquery) (2.5.0)
Requirement already satisfied: google-resumable-media<3.0.0,>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from google-cloud-bigquery) (2.8.0)
Requirement already satisfied: packaging>=24.2.0 in /usr/local/lib/python3.12/dist-packages (from google-cloud-bigquery) (25.0)
Requirement already satisfied: python-dateutil<3.0.0,>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from google-cloud-bigquery) (2.9.0.post0)
Requirement already satisfied: requests<3.0.0,>=2.21.0 in /usr/local/lib/python3.12/dist-packages (from google-cloud-bigquery) (2.32.4)
Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.12/dist-packages (from google-cloud-storage) (1.7.1)
Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.56.2 in /usr/local/lib/python3.12/dist-packages (from google-api-core<3.0.0,>=2.11.1->google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (1.72.0)
Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0,>=3.19.5 in /usr/local/lib/python3.12/dist-packages (from google-api-core<3.0.0,>=2.11.1->google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (5.29.5)
Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /usr/local/lib/python3.12/dist-packages (from google-api-core<3.0.0,>=2.11.1->google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (1.26.1)
Requirement already satisfied: grpcio<2.0.0,>=1.33.2 in /usr/local/lib/python3.12/dist-packages (from google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (1.76.0)
Requirement already satisfied: grpcio-status<2.0.0,>=1.33.2 in /usr/local/lib/python3.12/dist-packages (from google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (1.71.2)
Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from google-auth<3.0.0,>=2.14.1->google-cloud-bigquery) (5.5.2)
Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.12/dist-packages (from google-auth<3.0.0,>=2.14.1->google-cloud-bigquery) (0.4.2)
Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.12/dist-packages (from google-auth<3.0.0,>=2.14.1->google-cloud-bigquery) (4.9.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil<3.0.0,>=2.8.2->google-cloud-bigquery) (1.17.0)
Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.21.0->google-cloud-bigquery) (3.4.4)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.21.0->google-cloud-bigquery) (3.11)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.21.0->google-cloud-bigquery) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.21.0->google-cloud-bigquery) (2025.11.12)
Requirement already satisfied: typing-extensions~=4.12 in /usr/local/lib/python3.12/dist-packages (from grpcio<2.0.0,>=1.33.2->google-api-core[grpc]<3.0.0,>=2.11.1->google-cloud-bigquery) (4.15.0)
Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /usr/local/lib/python3.12/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.0,>=2.14.1->google-cloud-bigquery) (0.6.1)
In [1]:
    # pip install dash-bootstrap-components
In [7]:
# Import Dependencies
# =============================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Any, Optional, Union
import warnings
import json
import os
# NOTE(review): this silences ALL warnings globally, including pandas
# deprecation warnings that would flag real problems below — consider
# narrowing to specific categories.
warnings.filterwarnings('ignore')

# NOTE(review): `google.generativeai` is provided by the `google-generativeai`
# package; the install cell only installs `google-genai`, which exposes
# `google.genai` instead. Confirm the correct SDK is installed or this import
# will fail on a fresh environment.
import google.generativeai as genai
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import silhouette_score, r2_score
from sklearn.feature_selection import mutual_info_regression
import plotly.express as px
import plotly.graph_objects as go

# Dashboard imports — optional; downstream code checks DASH_AVAILABLE before
# building any dashboard so the rest of the pipeline still runs without Dash.
try:
    from dash import Dash, dcc, html, Input, Output
    import dash_bootstrap_components as dbc
    DASH_AVAILABLE = True
except ImportError:
    DASH_AVAILABLE = False
    print("Warning: Dash not available. Dashboard features will be disabled.")
In [8]:
# Configuration and API Setup
# =============================================================================

class Config:
    """Central configuration: API credentials, cleaning thresholds, and
    plotting/dashboard defaults used by all agents."""

    # SECURITY NOTE(review): never commit a real key as the fallback string —
    # prefer failing fast when GEMINI_API_KEY is unset.
    GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', "Your_Gemini_api")
    MODEL_NAME = 'gemini-2.0-flash-exp'
    # Columns with more than this fraction of missing values are dropped.
    MAX_MISSING_THRESHOLD = 0.5
    # |z-score| above this value marks a row as an outlier.
    OUTLIER_THRESHOLD = 3
    N_CLUSTERS_DEFAULT = 3
    CORRELATION_THRESHOLD = 0.5
    FIGURE_SIZE = (12, 6)
    COLOR_PALETTE = 'viridis'
    DASH_PORT = 8050

    # GCP Configuration (optional) — placeholders; override via env vars.
    GCP_PROJECT_ID = os.getenv('GCP_PROJECT_ID', 'your_google_project ID')
    GCS_BUCKET = os.getenv('GCS_BUCKET', 'your-bucket-name')

# Configures the SDK at import time; with the placeholder key, API calls will
# fail later (agents catch this and log rather than crash).
genai.configure(api_key=Config.GEMINI_API_KEY)
In [9]:
# Base Agent Class
# =============================================================================

class BaseAgent:
    """Shared foundation for all pipeline agents.

    Each agent owns a Gemini model handle, a dict of saved results, and a
    chronological log that is both stored and echoed to stdout.
    """

    def __init__(self, name: str, model_name: str = Config.MODEL_NAME):
        self.name = name
        self.model_name = model_name
        self.model = genai.GenerativeModel(model_name)
        self.results = {}  # keyed outputs stored via save_result()
        self.logs = []     # every message passed to log(), in order

    def log(self, message: str):
        """Record a message in this agent's log and echo it to stdout."""
        entry = f"[{self.name}] {message}"
        self.logs.append(entry)
        print(entry)

    def save_result(self, key: str, value: Any):
        """Store a named result for later retrieval by the orchestrator."""
        self.results[key] = value

    def get_llm_guidance(self, prompt: str) -> str:
        """Ask the Gemini model for free-text guidance.

        Returns the model's text, or "" if the API call fails (the failure is
        logged rather than raised so the pipeline keeps running).
        """
        try:
            return self.model.generate_content(prompt).text
        except Exception as e:
            self.log(f"LLM guidance failed: {e}")
            return ""
In [10]:
# Agent 1 - JSON/XML Normalizer Agent
# =============================================================================

class JSONXMLNormalizerAgent(BaseAgent):
    """Transforms semi-structured data (JSON / XML) into flat tabular formats."""

    def __init__(self):
        super().__init__("JSON/XML Normalizer")

    def normalize_json(self, data: Union[str, Dict, List]) -> pd.DataFrame:
        """Convert JSON (string, dict, or list) to a flat DataFrame.

        Nested objects are flattened with dotted column names via
        pd.json_normalize. Returns an empty DataFrame on failure (the error
        is logged, not raised).
        """
        self.log("Normalizing JSON data...")

        try:
            if isinstance(data, str):
                data = json.loads(data)

            # Handle nested structures
            df = pd.json_normalize(data)
            self.log(f"JSON normalized: {df.shape[0]} rows, {df.shape[1]} columns")
            return df

        except Exception as e:
            self.log(f"JSON normalization error: {e}")
            return pd.DataFrame()

    def normalize_xml(self, xml_string: str) -> pd.DataFrame:
        """Convert an XML string to a flat DataFrame via xmltodict.

        Returns an empty DataFrame on failure (the error is logged, not raised).
        """
        self.log("Normalizing XML data...")

        try:
            import xmltodict
            data_dict = xmltodict.parse(xml_string)
            df = pd.json_normalize(data_dict)
            self.log(f"XML normalized: {df.shape[0]} rows, {df.shape[1]} columns")
            return df

        except Exception as e:
            self.log(f"XML normalization error: {e}")
            return pd.DataFrame()

    def auto_detect_and_normalize(self, data: Any) -> pd.DataFrame:
        """Automatically detect the data format and normalize it.

        FIX: normalize_json/normalize_xml swallow their own exceptions and
        return an empty DataFrame instead of raising, so the original
        try/except fallback could never reach the XML branch for string
        input. We now fall through based on whether the previous attempt
        produced any data.
        """
        self.log("Auto-detecting data format...")

        if isinstance(data, pd.DataFrame):
            self.log("Data is already a DataFrame")
            return data

        if isinstance(data, (dict, list)):
            return self.normalize_json(data)

        if isinstance(data, str):
            # Try JSON first; fall back to XML if JSON produced nothing.
            df = self.normalize_json(data)
            if not df.empty:
                return df

            df = self.normalize_xml(data)
            if not df.empty:
                return df

        self.log("Could not detect format, returning empty DataFrame")
        return pd.DataFrame()

    def flatten_nested_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Flatten any remaining dict/list-valued columns into dotted columns.

        FIX: pd.json_normalize returns a fresh RangeIndex, so the expanded
        frame must be re-indexed to the non-null source rows before joining —
        otherwise rows are misaligned whenever the column contains NaNs.
        """
        self.log("Flattening nested columns...")

        for col in df.columns:
            if df[col].apply(lambda x: isinstance(x, (dict, list))).any():
                try:
                    non_null = df[col].dropna()
                    nested_df = pd.json_normalize(non_null)
                    nested_df.index = non_null.index  # align with parent rows
                    nested_df.columns = [f"{col}.{subcol}" for subcol in nested_df.columns]
                    df = df.drop(columns=[col]).join(nested_df)
                except Exception:
                    self.log(f"Could not flatten column: {col}")

        return df
In [11]:
# Agent 2 - Data Quality Agent
# =============================================================================

class DataQualityAgent(BaseAgent):
    """Implements data validation, cleaning, and quality checks."""

    def __init__(self):
        super().__init__("Data Quality")
        self.quality_report = {}   # cached output of assess_data_quality()
        self.label_encoders = {}   # fitted LabelEncoder per encoded column

    def assess_data_quality(self, df: pd.DataFrame) -> Dict:
        """Build a structural quality report: shape, dtypes, missingness,
        duplicate count, and memory footprint (MB)."""
        self.log("Assessing data quality...")
        report = {
            'shape': df.shape,
            'columns': list(df.columns),
            'dtypes': df.dtypes.to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'missing_percentage': (df.isnull().sum() / len(df) * 100).to_dict(),
            'duplicates': df.duplicated().sum(),
            'memory_usage': df.memory_usage(deep=True).sum() / 1024**2  # MB
        }
        self.quality_report = report
        self.log(f"Quality assessment complete: {report['shape'][0]} rows, {report['shape'][1]} columns")
        return report

    def identify_data_types(self, df: pd.DataFrame) -> Dict:
        """Identify and categorize data types into numerical / categorical /
        datetime buckets (a column appears in exactly one bucket)."""
        self.log("Identifying data types...")
        dtypes_info = {
            'numerical': list(df.select_dtypes(include=[np.number]).columns),
            'categorical': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime': list(df.select_dtypes(include=['datetime64']).columns),
            'all_dtypes': df.dtypes.to_dict()
        }
        self.log(f"Found {len(dtypes_info['numerical'])} numerical, {len(dtypes_info['categorical'])} categorical columns")
        return dtypes_info

    def encode_categorical_features(self, df: pd.DataFrame, method: str = 'label') -> pd.DataFrame:
        """Encode categorical features for ML algorithms.

        method='label' adds a `<col>_encoded` column per categorical column
        (originals are kept; NaNs are left unencoded) and stores the fitted
        encoder in self.label_encoders. method='onehot' replaces categorical
        columns with dummy indicator columns.
        """
        self.log(f"Encoding categorical features using {method} encoding...")
        df_encoded = df.copy()
        categorical_cols = df_encoded.select_dtypes(include=['object', 'category']).columns

        if len(categorical_cols) == 0:
            self.log("No categorical columns to encode")
            return df_encoded

        from sklearn.preprocessing import LabelEncoder

        if method == 'label':
            for col in categorical_cols:
                le = LabelEncoder()
                # Only fit/transform non-null values; encoded column becomes
                # float because of the NaN gaps.
                mask = df_encoded[col].notna()
                df_encoded.loc[mask, col + '_encoded'] = le.fit_transform(df_encoded.loc[mask, col])
                self.label_encoders[col] = le
                self.log(f"Encoded {col}: {df_encoded[col].nunique()} unique values")

        elif method == 'onehot':
            df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, prefix=categorical_cols)
            self.log(f"One-hot encoded {len(categorical_cols)} categorical columns")

        self.log(f"Categorical encoding complete. New shape: {df_encoded.shape}")
        return df_encoded

    def handle_missing_values(self, df: pd.DataFrame, strategy: str = 'smart') -> pd.DataFrame:
        """Drop columns over Config.MAX_MISSING_THRESHOLD missingness, then
        impute the rest.

        Numeric columns: mean for strategy='mean', otherwise median (both
        'median' and the default 'smart' use the median). Non-numeric
        columns: mode, falling back to 'Unknown' when no mode exists.
        """
        self.log(f"Handling missing values with strategy: {strategy}")
        df_clean = df.copy()

        for col in list(df_clean.columns):
            missing_pct = df_clean[col].isnull().sum() / len(df_clean)

            if missing_pct > Config.MAX_MISSING_THRESHOLD:
                self.log(f"Dropping column {col}: {missing_pct*100:.1f}% missing")
                df_clean = df_clean.drop(columns=[col])
                continue

            # FIX: assign the result instead of chained fillna(..., inplace=True),
            # which is deprecated in pandas 2.x and may silently act on a copy.
            # FIX: use is_numeric_dtype so int32/float32 columns are imputed
            # numerically instead of falling into the mode branch.
            if pd.api.types.is_numeric_dtype(df_clean[col]):
                if strategy == 'mean':
                    df_clean[col] = df_clean[col].fillna(df_clean[col].mean())
                else:
                    df_clean[col] = df_clean[col].fillna(df_clean[col].median())
            else:
                modes = df_clean[col].mode()
                fill_value = modes[0] if not modes.empty else 'Unknown'
                df_clean[col] = df_clean[col].fillna(fill_value)

        self.log(f"Missing values handled: {df_clean.shape[1]} columns remaining")
        return df_clean

    def handle_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop exact duplicate rows, logging how many were removed."""
        self.log("Handling duplicate rows...")
        initial_rows = len(df)
        df_clean = df.drop_duplicates().copy()
        dropped_rows = initial_rows - len(df_clean)
        self.log(f"Dropped {dropped_rows} duplicate rows")
        return df_clean

    def detect_outliers(self, df: pd.DataFrame, method: str = 'zscore') -> Dict:
        """Report per-column outlier counts; 'zscore' flags values with
        |z| > Config.OUTLIER_THRESHOLD."""
        self.log(f"Detecting outliers using {method} method...")
        outlier_report = {}
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            if method == 'zscore':
                std = df[col].std()
                # FIX: guard against zero/NaN std (constant or empty column)
                # to avoid division-by-zero NaN propagation.
                if not std or pd.isna(std):
                    outlier_report[col] = {'count': 0, 'percentage': 0.0}
                    continue
                z_scores = np.abs((df[col] - df[col].mean()) / std)
                outliers = z_scores > Config.OUTLIER_THRESHOLD
                outlier_report[col] = {
                    'count': outliers.sum(),
                    'percentage': (outliers.sum() / len(df)) * 100
                }

        self.log(f"Outlier detection complete for {len(numeric_cols)} columns")
        return outlier_report

    def normalize_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Standardize numerical columns to zero mean / unit variance
        (non-numeric columns pass through untouched)."""
        self.log("Normalizing numerical features...")
        df_normalized = df.copy()
        numeric_cols = df_normalized.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            self.log("No numerical columns to normalize")
            return df_normalized

        scaler = StandardScaler()
        df_normalized[numeric_cols] = scaler.fit_transform(df_normalized[numeric_cols])

        self.log(f"Normalized {len(numeric_cols)} numerical columns")
        return df_normalized

    def validate_with_great_expectations(self, df: pd.DataFrame) -> Dict:
        """Run a lightweight structural validation.

        NOTE(review): despite the name, this does not call the
        great_expectations library — it returns a hand-rolled summary, and
        'columns_exist' is trivially True because it checks df's columns
        against themselves. Replace with a real expectation suite (or rename)
        when the expected schema is known.
        """
        self.log("Running Great Expectations validation...")

        try:
            validation_results = {
                'columns_exist': all(col in df.columns for col in df.columns),
                'no_null_in_required': df.notnull().all().to_dict(),
                'valid_dtypes': df.dtypes.to_dict()
            }

            self.log("Great Expectations validation complete")
            return validation_results

        except Exception as e:
            self.log(f"Great Expectations validation error: {e}")
            return {'error': str(e)}
In [12]:
# Agent 3 - Data Exploration Agent
# =============================================================================

class DataExplorationAgent(BaseAgent):
    """Generates summary statistics, distributions, and clustering"""

    def __init__(self):
        super().__init__("Data Exploration")

    def generate_summary_statistics(self, df: pd.DataFrame) -> Dict:
        """Compute numeric describe(), per-categorical top values, dtypes
        and unique counts; persists the result as 'summary_statistics'."""
        self.log("Generating summary statistics...")
        summary = {
            'numerical_summary': df.describe().to_dict(),
            'categorical_summary': {},
            'data_types': df.dtypes.to_dict(),
            'unique_counts': df.nunique().to_dict()
        }

        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            summary['categorical_summary'][col] = {
                'unique_values': df[col].nunique(),
                'top_values': df[col].value_counts().head(10).to_dict()
            }

        self.save_result('summary_statistics', summary)
        self.log("Summary statistics generated")
        return summary

    def identify_outliers(self, df: pd.DataFrame, method: str = 'zscore') -> Dict:
        """Identify outliers in numerical columns.

        Args:
            df: Input frame.
            method: 'zscore' (|z| > Config.OUTLIER_THRESHOLD) or 'iqr'
                (outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR). Any other value
                yields an empty report.

        Returns:
            Per-column dict with 'count', 'percentage', and (zscore only)
            up to 10 sample 'outlier_values'.
        """
        self.log(f"Identifying outliers using {method} method...")
        outlier_report = {}
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            if method == 'zscore':
                # A constant column (std == 0) produces NaN z-scores, which
                # compare False, so such columns report zero outliers.
                z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
                outliers = z_scores > Config.OUTLIER_THRESHOLD
                outlier_report[col] = {
                    'count': int(outliers.sum()),
                    'percentage': float((outliers.sum() / len(df)) * 100),
                    'outlier_values': df[outliers][col].tolist()[:10]  # First 10 outliers
                }
            elif method == 'iqr':
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
                outlier_report[col] = {
                    'count': int(outliers.sum()),
                    'percentage': float((outliers.sum() / len(df)) * 100)
                }

        self.log(f"Outlier identification complete for {len(numeric_cols)} columns")
        return outlier_report

    def plot_distributions(self, df: pd.DataFrame, save_path: str = None):
        """Plot a histogram grid (3 per row) for every numeric column;
        optionally saves the figure to ``save_path``."""
        self.log("Plotting distributions...")
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        n_cols = len(numeric_cols)

        if n_cols == 0:
            self.log("No numerical columns to plot")
            return

        n_rows = (n_cols + 2) // 3
        fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
        # Bug fix: subplots(n_rows, 3) always returns an ndarray of Axes
        # (there are always 3 columns), so the old
        # `axes.flatten() if n_cols > 1 else [axes]` wrapped an array in a
        # list when n_cols == 1 and crashed on axes[0].hist(...).
        axes = np.asarray(axes).flatten()

        for idx, col in enumerate(numeric_cols):
            axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
            axes[idx].set_title(f'Distribution of {col}')
            axes[idx].set_xlabel(col)
            axes[idx].set_ylabel('Frequency')

        # Blank out the unused panels of the last row.
        for idx in range(n_cols, len(axes)):
            axes[idx].axis('off')

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()

    def plot_countplots(self, df: pd.DataFrame):
        """Horizontal count plot per categorical column, ordered by frequency."""
        self.log("Plotting count plots...")
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        if len(categorical_cols) == 0:
            self.log("No categorical columns to plot")
            return

        for col in categorical_cols:
            plt.figure(figsize=(10, 6))
            # NOTE(review): seaborn >= 0.13 warns on `palette` without `hue`;
            # behavior is unchanged, only a deprecation warning is emitted.
            sns.countplot(y=df[col], order=df[col].value_counts().index, palette=Config.COLOR_PALETTE)
            plt.title(f'Count Plot of {col}')
            plt.tight_layout()
            plt.show()

    def plot_boxplots(self, df: pd.DataFrame):
        """One interactive plotly box plot per numeric column."""
        self.log("Plotting box plots...")
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            self.log("No numerical columns to plot")
            return

        for col in numeric_cols:
            fig = px.box(df, y=col, title=f'Box Plot of {col}')
            fig.show()

    def perform_clustering(self, df: pd.DataFrame, n_clusters: int = None) -> Dict:
        """K-means on the (NaN-free) numeric columns.

        When ``n_clusters`` is None, selects k in [2, min(10, n_rows//2))
        by maximum silhouette score. Plots a 2-D PCA projection of the
        clusters and persists the result as 'clustering'.

        Returns:
            Dict with 'n_clusters', 'labels', 'silhouette_score';
            empty dict when the data is unsuitable.
        """
        self.log("Performing K-means clustering...")
        numeric_df = df.select_dtypes(include=[np.number]).dropna()

        if numeric_df.shape[1] < 2 or numeric_df.shape[0] < 4:
            self.log("Not enough features for clustering")
            return {}

        if n_clusters is None:
            K_range = range(2, min(10, len(numeric_df)//2))
            # Bug fix: with fewer than 6 rows K_range is empty and
            # np.argmax([]) raised; fall back to 2 clusters instead.
            if len(K_range) == 0:
                n_clusters = 2
            else:
                silhouette_scores = []
                for k in K_range:
                    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                    labels = kmeans.fit_predict(numeric_df)
                    score = silhouette_score(numeric_df, labels)
                    silhouette_scores.append(score)

                n_clusters = K_range[np.argmax(silhouette_scores)]
            self.log(f"Optimal number of clusters: {n_clusters}")

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(numeric_df)

        # Project to 2-D for visualization only; clustering ran in full space.
        pca = PCA(n_components=2)
        pca_features = pca.fit_transform(numeric_df)

        plt.figure(figsize=Config.FIGURE_SIZE)
        # NOTE(review): cmap expects a matplotlib colormap name — confirm
        # Config.COLOR_PALETTE is a valid cmap, not a seaborn palette.
        scatter = plt.scatter(pca_features[:, 0], pca_features[:, 1],
                            c=labels, cmap=Config.COLOR_PALETTE, alpha=0.6)
        plt.colorbar(scatter, label='Cluster')
        plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        plt.title('K-Means Clustering (PCA Projection)')
        plt.tight_layout()
        plt.show()

        clustering_results = {
            'n_clusters': n_clusters,
            'labels': labels.tolist(),
            'silhouette_score': float(silhouette_score(numeric_df, labels))
        }

        self.save_result('clustering', clustering_results)
        return clustering_results

    def create_scatter_matrix(self, df: pd.DataFrame, max_cols: int = 5):
        """Create scatter plot matrix for the first ``max_cols`` numeric columns."""
        self.log("Creating scatter matrix...")

        numeric_cols = df.select_dtypes(include=[np.number]).columns[:max_cols]

        if len(numeric_cols) < 2:
            self.log("Not enough numerical columns for scatter matrix")
            return

        pd.plotting.scatter_matrix(df[numeric_cols], figsize=(15, 15), alpha=0.6, diagonal='hist')
        plt.suptitle('Scatter Matrix', y=1.0)
        plt.tight_layout()
        plt.show()
In [13]:
# Agent 4 - Relationship Discovery Agent
# =============================================================================

class RelationshipDiscoveryAgent(BaseAgent):
    """Conducts correlation, mutual information, and regression analysis"""

    def __init__(self):
        super().__init__("Relationship Discovery")

    def _plot_helper(self, fig, title):
        """Helper method for displaying plots.

        Handles both matplotlib Figures (title + show) and plotly figures
        (layout title + show), then logs the event.
        """
        if isinstance(fig, plt.Figure):
            fig.suptitle(title, y=1.02)
            plt.tight_layout()
            plt.show()
        else:
            fig.update_layout(title_text=title)
            fig.show()
        self.log(f"Plot '{title}' generated")

    def compute_correlations(self, df: pd.DataFrame, plot_heatmap: bool = False) -> Dict:
        """Pearson + Spearman correlation matrices over numeric columns.

        Persists the result as 'correlations'; returns {} when fewer than
        two numeric columns exist.
        """
        self.log("Computing correlation matrices...")
        numeric_df = df.select_dtypes(include=[np.number])

        if numeric_df.shape[1] < 2:
            self.log("Not enough numerical columns for correlation")
            return {}

        results = {
            'pearson': numeric_df.corr(method='pearson').to_dict(),
            'spearman': numeric_df.corr(method='spearman').to_dict()
        }

        if plot_heatmap:
            self.plot_correlation_heatmap(df)

        self.save_result('correlations', results)
        self.log("Correlation analysis complete")
        return results

    def plot_correlation_heatmap(self, df: pd.DataFrame):
        """Annotated Pearson correlation heatmap of the numeric columns."""
        self.log("Plotting correlation heatmap...")
        numeric_df = df.select_dtypes(include=[np.number])

        if numeric_df.shape[1] < 2:
            self.log("Not enough numerical columns for correlation heatmap")
            return

        plt.figure(figsize=(12, 10))
        sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f',
                   cmap='coolwarm', center=0, square=True)
        self._plot_helper(plt.gcf(), 'Pearson Correlation Heatmap')

    def plot_pairplot(self, df: pd.DataFrame, max_cols: int = 5):
        """Seaborn pairplot over the first ``max_cols`` numeric columns."""
        self.log("Plotting pairplot...")
        numeric_cols = df.select_dtypes(include=[np.number]).columns[:max_cols]

        if len(numeric_cols) < 2:
            self.log("Not enough numerical columns for pairplot")
            return

        fig = sns.pairplot(df[numeric_cols])
        self._plot_helper(fig.fig, 'Pair Plot')

    def plot_categorical_vs_numerical(self, df: pd.DataFrame,
                                     categorical_col: str = None,
                                     numerical_col: str = None):
        """Box plot of one numeric column grouped by one categorical column.

        When either column is omitted, the first available column of that
        kind is used.
        """
        self.log("Plotting categorical vs numerical relationship...")
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        numerical_cols = df.select_dtypes(include=[np.number]).columns

        if not categorical_col and len(categorical_cols) > 0:
            categorical_col = categorical_cols[0]
        if not numerical_col and len(numerical_cols) > 0:
            numerical_col = numerical_cols[0]

        if not categorical_col or not numerical_col:
            self.log("Not enough categorical or numerical columns for plotting")
            return

        fig = px.box(df, x=categorical_col, y=numerical_col)
        self._plot_helper(fig, f'{numerical_col} by {categorical_col}')

    def plot_boxplots_categorical_vs_numerical(self, df: pd.DataFrame):
        """Plot box plots for all categorical vs numerical combinations
        (capped at the first 2 columns of each kind to bound output)."""
        self.log("Plotting box plots for categorical vs numerical...")
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        numerical_cols = df.select_dtypes(include=[np.number]).columns

        if len(categorical_cols) == 0 or len(numerical_cols) == 0:
            self.log("Not enough categorical or numerical columns")
            return

        for cat_col in categorical_cols[:2]:  # Limit to first 2 categorical columns
            for num_col in numerical_cols[:2]:  # Limit to first 2 numerical columns
                fig = px.box(df, x=cat_col, y=num_col,
                           title=f'{num_col} by {cat_col}')
                fig.show()

        self.log("Box plots for categorical vs numerical generated")

    def perform_regression_analysis(self, df: pd.DataFrame, target_col: str = None) -> Dict:
        """Fit OLS of ``target_col`` on the remaining numeric columns.

        Defaults the target to the last numeric column. Plots predicted vs
        actual and persists the result as 'regression'.

        Returns:
            Dict with target, features, coefficients, intercept, r2_score;
            {} when the data is unsuitable.
        """
        self.log("Performing regression analysis...")
        numeric_df = df.select_dtypes(include=[np.number]).dropna()

        # Bug fix: with fewer than two numeric columns (or no rows after
        # dropna) the original crashed on an empty feature matrix or an
        # empty column index.
        if numeric_df.shape[1] < 2 or numeric_df.shape[0] == 0:
            self.log("Not enough numerical data for regression")
            return {}

        if target_col is None:
            target_col = numeric_df.columns[-1]

        if target_col not in numeric_df.columns:
            self.log(f"Target column {target_col} not found")
            return {}

        X = numeric_df.drop(columns=[target_col])
        y = numeric_df[target_col]

        model = LinearRegression()
        model.fit(X, y)
        y_pred = model.predict(X)
        r2 = r2_score(y, y_pred)

        results = {
            'target': target_col,
            'features': list(X.columns),
            # Cast numpy scalars to plain floats so the dict stays
            # JSON-serializable without relying on default=str.
            'coefficients': {feat: float(coef) for feat, coef in zip(X.columns, model.coef_)},
            'intercept': float(model.intercept_),
            'r2_score': float(r2)
        }

        plt.figure(figsize=Config.FIGURE_SIZE)
        plt.scatter(y, y_pred, alpha=0.5)
        # Reference diagonal: a perfect model would place all points on it.
        plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        self._plot_helper(plt.gcf(), f'Linear Regression: R² = {r2:.3f}')

        self.save_result('regression', results)
        return results

    def identify_strong_relationships(self, df: pd.DataFrame, threshold: float = Config.CORRELATION_THRESHOLD) -> List:
        """Identify variable pairs whose |Pearson correlation| >= threshold.

        Returns a list of {'var1', 'var2', 'correlation', 'strength'} dicts
        (strength 'strong' above 0.7, else 'moderate').
        """
        self.log(f"Identifying relationships above threshold {threshold}...")

        numeric_df = df.select_dtypes(include=[np.number])
        corr_matrix = numeric_df.corr()

        strong_relationships = []

        # Upper triangle only: each unordered pair reported once.
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) >= threshold:
                    strong_relationships.append({
                        'var1': corr_matrix.columns[i],
                        'var2': corr_matrix.columns[j],
                        'correlation': float(corr_val),  # plain float, not numpy scalar
                        'strength': 'strong' if abs(corr_val) > 0.7 else 'moderate'
                    })

        self.log(f"Found {len(strong_relationships)} strong relationships")
        return strong_relationships

    def compute_mutual_information(self, df: pd.DataFrame, target_col: str = None) -> Dict:
        """Compute mutual information of each numeric feature with the target.

        Defaults the target to the last numeric column. Plots a sorted bar
        chart and persists the result as 'mutual_information'.

        Returns:
            {feature: mi_score} dict; {} when the data is unsuitable.
        """
        self.log("Computing mutual information...")

        numeric_df = df.select_dtypes(include=[np.number]).dropna()

        # Bug fix: guard against an empty feature matrix / empty column
        # index, which crashed the original.
        if numeric_df.shape[1] < 2 or numeric_df.shape[0] == 0:
            self.log("Not enough numerical data for mutual information")
            return {}

        if target_col is None:
            target_col = numeric_df.columns[-1]

        if target_col not in numeric_df.columns:
            self.log(f"Target column {target_col} not found")
            return {}

        X = numeric_df.drop(columns=[target_col])
        y = numeric_df[target_col]

        mi_scores = mutual_info_regression(X, y, random_state=42)
        mi_dict = dict(zip(X.columns, mi_scores))

        # Plot, highest score first
        plt.figure(figsize=Config.FIGURE_SIZE)
        sorted_mi = sorted(mi_dict.items(), key=lambda x: x[1], reverse=True)
        features, scores = zip(*sorted_mi)
        plt.barh(features, scores)
        plt.xlabel('Mutual Information Score')
        plt.title(f'Mutual Information with {target_col}')
        plt.tight_layout()
        plt.show()

        self.save_result('mutual_information', mi_dict)
        self.log(f"Mutual information computed for {len(mi_dict)} features")
        return mi_dict
In [14]:
# Agent 5 - Data Storytelling Agent
# =============================================================================

class DataStorytellingAgent(BaseAgent):
    """Creates interactive visualizations and narratives"""

    def __init__(self):
        super().__init__("Data Storytelling")
        # Holds the Dash app built by create_interactive_dashboard (or None).
        self.dashboard_app = None

    def generate_narrative_summary(self, df: pd.DataFrame, analysis_results: Dict = None) -> str:
        """Generate AI-powered narrative summary.

        Builds a prompt from basic frame stats plus (truncated) analysis
        results and asks the LLM for a narrative; falls back to a static
        template when the LLM is unavailable or fails.
        """
        self.log("Generating narrative summary...")

        if analysis_results is None:
            analysis_results = {}

        # Truncate serialized results to 1000 chars to bound prompt size.
        context = f"""
        Dataset Summary:
        - Shape: {df.shape[0]} rows, {df.shape[1]} columns
        - Numerical columns: {df.select_dtypes(include=[np.number]).shape[1]}
        - Categorical columns: {df.select_dtypes(include=['object', 'category']).shape[1]}
        - Missing values: {df.isnull().sum().sum()}

        Analysis Results:
        {json.dumps(analysis_results, indent=2, default=str)[:1000]}

        Generate a concise narrative summary highlighting key findings and insights.
        """

        try:
            narrative = self.get_llm_guidance(context)
            # Fall back when the LLM returned nothing or an auth error string.
            # NOTE(review): substring match on "API key not valid" is brittle
            # — confirm the error format of get_llm_guidance.
            if not narrative or "API key not valid" in narrative:
                narrative = self._generate_fallback_narrative(df, analysis_results)
            self.save_result('narrative', narrative)
            self.log("Narrative summary generated")
            return narrative
        except Exception as e:
            self.log(f"Narrative generation failed: {e}")
            return self._generate_fallback_narrative(df, analysis_results)

    def generate_summary_insights(self, df: pd.DataFrame, analysis_results: Dict = None) -> str:
        """Alias for generate_narrative_summary"""
        self.log("Generating summary insights...")
        return self.generate_narrative_summary(df, analysis_results)

    def _generate_fallback_narrative(self, df: pd.DataFrame, analysis_results: Dict) -> str:
        """Generate basic narrative without LLM"""
        # Static markdown built purely from the frame; analysis_results is
        # accepted for signature parity but not used here.
        narrative = f"""
        ## Exploratory Data Analysis Summary

        ### Dataset Overview
        - Total Records: {df.shape[0]:,}
        - Total Features: {df.shape[1]}
        - Numerical Features: {df.select_dtypes(include=[np.number]).shape[1]}
        - Categorical Features: {df.select_dtypes(include=['object', 'category']).shape[1]}

        ### Data Quality
        - Missing Values: {df.isnull().sum().sum():,}
        - Duplicate Rows: {df.duplicated().sum()}

        ### Key Findings
        The dataset has been analyzed successfully. Review the visualizations and
        statistical summaries for detailed insights.
        """
        return narrative

    def create_interactive_dashboard(self, df: pd.DataFrame, analysis_results: Dict) -> Optional[Dash]:
        """Create interactive Dash dashboard.

        Layout: four summary cards, dropdown-driven scatter plot, plus a
        histogram and box plot wired to the same dropdowns via callbacks.
        Returns the app (also stored on self.dashboard_app) or None when
        Dash is not installed.
        """
        if not DASH_AVAILABLE:
            self.log("Dash not available. Cannot create dashboard.")
            return None

        self.log("Creating interactive dashboard...")

        app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

        app.layout = dbc.Container([
            dbc.Row([
                dbc.Col(html.H1("🔍 Multi-Agent EDA Dashboard", className="text-center mb-4"))
            ]),

            # Summary Statistics Cards
            dbc.Row([
                dbc.Col([
                    dbc.Card([
                        dbc.CardBody([
                            html.H4("📊 Dataset Overview", className="card-title"),
                            html.P(f"Rows: {df.shape[0]:,}"),
                            html.P(f"Columns: {df.shape[1]}"),
                            html.P(f"Numerical: {len(numeric_cols)}"),
                            html.P(f"Categorical: {len(categorical_cols)}"),
                        ])
                    ])
                ], width=3),

                dbc.Col([
                    dbc.Card([
                        dbc.CardBody([
                            html.H4("🎯 Data Quality", className="card-title"),
                            html.P(f"Missing Values: {df.isnull().sum().sum():,}"),
                            html.P(f"Duplicates: {df.duplicated().sum()}"),
                            html.P(f"Memory: {df.memory_usage(deep=True).sum()/1024**2:.2f} MB"),
                        ])
                    ])
                ], width=3),

                dbc.Col([
                    dbc.Card([
                        dbc.CardBody([
                            html.H4("🔗 Relationships", className="card-title"),
                            html.P("Analysis Complete"),
                            html.P("Correlations Computed"),
                            html.P("Patterns Identified"),
                        ])
                    ])
                ], width=3),

                dbc.Col([
                    dbc.Card([
                        dbc.CardBody([
                            html.H4("✨ Insights", className="card-title"),
                            html.P("Clusters Found"),
                            html.P("Outliers Detected"),
                            html.P("Ready for Action"),
                        ])
                    ])
                ], width=3),
            ], className="mb-4"),

            # Interactive Plots — dropdowns default to the first two numeric
            # columns and the first categorical column when available.
            dbc.Row([
                dbc.Col([
                    html.H4("Select Variables for Analysis"),
                    html.Label("X-Axis:"),
                    dcc.Dropdown(
                        id='x-axis-dropdown',
                        options=[{'label': col, 'value': col} for col in numeric_cols],
                        value=numeric_cols[0] if numeric_cols else None
                    ),
                    html.Label("Y-Axis:", className="mt-2"),
                    dcc.Dropdown(
                        id='y-axis-dropdown',
                        options=[{'label': col, 'value': col} for col in numeric_cols],
                        value=numeric_cols[1] if len(numeric_cols) > 1 else None
                    ),
                    html.Label("Color By:", className="mt-2"),
                    dcc.Dropdown(
                        id='color-dropdown',
                        options=[{'label': col, 'value': col} for col in categorical_cols],
                        value=categorical_cols[0] if categorical_cols else None
                    ),
                ], width=3),

                dbc.Col([
                    dcc.Graph(id='scatter-plot')
                ], width=9),
            ], className="mb-4"),

            dbc.Row([
                dbc.Col([
                    dcc.Graph(id='histogram')
                ], width=6),
                dbc.Col([
                    dcc.Graph(id='box-plot')
                ], width=6),
            ]),
        ], fluid=True)

        # Callbacks — each returns an empty go.Figure() when the required
        # dropdown value is missing, so the graphs render blank, not error.
        @app.callback(
            Output('scatter-plot', 'figure'),
            [Input('x-axis-dropdown', 'value'),
             Input('y-axis-dropdown', 'value'),
             Input('color-dropdown', 'value')]
        )
        def update_scatter(x_col, y_col, color_col):
            # Scatter of y vs x, optionally colored by a categorical column.
            if x_col and y_col:
                fig = px.scatter(df, x=x_col, y=y_col, color=color_col,
                               title=f'{y_col} vs {x_col}',
                               template='plotly_white')
                return fig
            return go.Figure()

        @app.callback(
            Output('histogram', 'figure'),
            [Input('x-axis-dropdown', 'value')]
        )
        def update_histogram(x_col):
            # Distribution of the selected X-axis column.
            if x_col:
                fig = px.histogram(df, x=x_col, title=f'Distribution of {x_col}',
                                 template='plotly_white')
                return fig
            return go.Figure()

        @app.callback(
            Output('box-plot', 'figure'),
            [Input('y-axis-dropdown', 'value')]
        )
        def update_boxplot(y_col):
            # Box plot of the selected Y-axis column.
            if y_col:
                fig = px.box(df, y=y_col, title=f'Box Plot of {y_col}',
                           template='plotly_white')
                return fig
            return go.Figure()

        self.dashboard_app = app
        self.log("Dashboard created successfully")
        return app

    def create_summary_report(self, df: pd.DataFrame, analysis_results: Dict) -> str:
        """Create comprehensive HTML report.

        Returns a self-contained HTML string (inline CSS) with dataset
        overview, describe() table, quality metrics, and the serialized
        analysis results truncated to 2000 characters.
        """
        self.log("Creating summary report...")

        report_html = f"""
        <html>
        <head>
            <title>EDA Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }}
                h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }}
                h2 {{ color: #34495e; border-bottom: 2px solid #3498db; margin-top: 30px; }}
                .metric {{ background: #ffffff; padding: 20px; margin: 10px 0; border-radius: 8px;
                          box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
                table {{ border-collapse: collapse; width: 100%; margin-top: 15px; }}
                th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
                th {{ background-color: #3498db; color: white; font-weight: bold; }}
                tr:nth-child(even) {{ background-color: #f2f2f2; }}
                .summary-box {{ background: #e8f4f8; padding: 15px; border-left: 4px solid #3498db;
                               margin: 20px 0; }}
            </style>
        </head>
        <body>
            <h1>🔍 Multi-Agent EDA Report</h1>

            <div class="summary-box">
                <strong>Generated:</strong> {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
            </div>

            <div class="metric">
                <h2>📊 Dataset Overview</h2>
                <p><strong>Total Rows:</strong> {df.shape[0]:,}</p>
                <p><strong>Total Columns:</strong> {df.shape[1]}</p>
                <p><strong>Numerical Columns:</strong> {df.select_dtypes(include=[np.number]).shape[1]}</p>
                <p><strong>Categorical Columns:</strong> {df.select_dtypes(include=['object', 'category']).shape[1]}</p>
                <p><strong>Memory Usage:</strong> {df.memory_usage(deep=True).sum()/1024**2:.2f} MB</p>
            </div>

            <div class="metric">
                <h2>📈 Summary Statistics</h2>
                {df.describe().to_html()}
            </div>

            <div class="metric">
                <h2>🎯 Data Quality Metrics</h2>
                <p><strong>Missing Values:</strong> {df.isnull().sum().sum():,}</p>
                <p><strong>Duplicate Rows:</strong> {df.duplicated().sum()}</p>
                <p><strong>Columns with Missing Data:</strong> {(df.isnull().sum() > 0).sum()}</p>
            </div>

            <div class="metric">
                <h2>🔬 Analysis Results</h2>
                <pre style="background: #f8f8f8; padding: 15px; border-radius: 5px; overflow-x: auto;">
{json.dumps(analysis_results, indent=2, default=str)[:2000]}
                </pre>
            </div>
        </body>
        </html>
        """

        return report_html
In [15]:
class LLMOrchestratorAgent(BaseAgent):
    """Orchestrates the entire EDA workflow using Gemini"""

    def __init__(self):
        """Set up the orchestrator with an empty agent registry and plan."""
        super().__init__("LLM Orchestrator", Config.MODEL_NAME)
        self.results = {}         # accumulated step outputs
        self.workflow_plan = []   # ordered {agent, action, ...} steps
        self.agents = {}          # role name -> specialized agent instance

    def initialize_agents(self):
        """Instantiate every specialized agent and register it by role name."""
        self.log("Initializing specialized agents...")
        registry = {
            'normalizer': JSONXMLNormalizerAgent,
            'quality': DataQualityAgent,
            'exploration': DataExplorationAgent,
            'relationships': RelationshipDiscoveryAgent,
            'storytelling': DataStorytellingAgent,
        }
        self.agents = {role: agent_cls() for role, agent_cls in registry.items()}
        self.log(f"Initialized {len(self.agents)} agents")

    def analyze_dataset_requirements(self, df: pd.DataFrame) -> Dict:
        """Profile ``df`` and derive which analyses the workflow should run.

        Returns a dict with shape, boolean capability flags, and an ordered
        'recommended_analyses' list (data_quality first when needed).
        """
        self.log("Analyzing dataset requirements...")

        numeric_count = df.select_dtypes(include=[np.number]).shape[1]
        categorical_count = df.select_dtypes(include=['object', 'category']).shape[1]

        requirements = {
            'shape': df.shape,
            'has_numerical': numeric_count > 0,
            'has_categorical': categorical_count > 0,
            'has_missing': df.isnull().sum().sum() > 0,
            'has_duplicates': df.duplicated().sum() > 0,
            'recommended_analyses': []
        }

        recommended = requirements['recommended_analyses']
        if requirements['has_numerical']:
            recommended += ['summary_statistics', 'distributions', 'correlations',
                            'clustering', 'regression']
        if requirements['has_categorical']:
            recommended += ['categorical_analysis', 'groupby_analysis']
        # Quality remediation, when needed, must precede everything else.
        if requirements['has_missing'] or requirements['has_duplicates']:
            recommended.insert(0, 'data_quality')

        return requirements

    def _validate_workflow(self, workflow_steps: List[Dict]) -> bool:
        """Check that every step names a known agent with a callable action.

        Logs the first failure encountered and returns False; returns True
        only when all steps are executable.
        """
        if not self.agents:
            # Agents must exist before their methods can be checked.
            self.initialize_agents()

        for step in workflow_steps:
            agent_name, action = step.get('agent'), step.get('action')
            if not (agent_name and action):
                self.log(f"Validation failed: Invalid workflow step: missing agent or action in {step}")
                return False
            agent = self.agents.get(agent_name)
            if agent is None:
                self.log(f"Validation failed: Agent '{agent_name}' not found for action '{action}'")
                return False
            if not callable(getattr(agent, action, None)):
                self.log(f"Validation failed: Agent '{agent_name}' has no callable method '{action}'")
                return False
        return True

    def create_workflow_plan(self, df: pd.DataFrame) -> List[Dict]:
        """Build and store the ordered EDA workflow for ``df``.

        Currently rule-based only: a fixed step list sorted by descending
        priority (stable sort preserves the authored order within a tier).
        """
        self.log("Creating workflow plan...")

        # Agents must exist before any planning (LLM or rule-based).
        if not self.agents:
            self.initialize_agents()

        requirements = self.analyze_dataset_requirements(df)

        # Rule-based fallback workflow, always available.
        rule_based_workflow = [
            # Data Quality & Preprocessing Steps (High Priority)
            {'agent': 'quality', 'action': 'assess_data_quality', 'priority': 5},
            {'agent': 'quality', 'action': 'handle_duplicates', 'priority': 5, 'parameters': {}},
            {'agent': 'quality', 'action': 'handle_missing_values', 'priority': 5, 'parameters': {'strategy': 'smart'}},
            {'agent': 'quality', 'action': 'identify_data_types', 'priority': 4},
            {'agent': 'quality', 'action': 'encode_categorical_features', 'priority': 4, 'parameters': {'method': 'label'}},
            {'agent': 'quality', 'action': 'normalize_data', 'priority': 4},
            {'agent': 'quality', 'action': 'detect_outliers', 'priority': 3, 'parameters': {'method': 'zscore'}},

            # Data Exploration Steps (Medium Priority)
            {'agent': 'exploration', 'action': 'generate_summary_statistics', 'priority': 4},
            {'agent': 'exploration', 'action': 'plot_distributions', 'priority': 3},
            {'agent': 'exploration', 'action': 'plot_boxplots', 'priority': 3},
            {'agent': 'exploration', 'action': 'plot_countplots', 'priority': 3},
            {'agent': 'exploration', 'action': 'perform_clustering', 'priority': 3},
            {'agent': 'exploration', 'action': 'create_scatter_matrix', 'priority': 2},


            # Relationship Analysis Steps (Medium-Low Priority)
            {'agent': 'relationships', 'action': 'compute_correlations', 'priority': 3, 'parameters': {'plot_heatmap': True}},
            {'agent': 'relationships', 'action': 'plot_pairplot', 'priority': 2},
            {'agent': 'relationships', 'action': 'plot_boxplots_categorical_vs_numerical', 'priority': 2},
            {'agent': 'relationships', 'action': 'identify_strong_relationships', 'priority': 2},
            # Removed target_col for generic fallback
            {'agent': 'relationships', 'action': 'perform_regression_analysis', 'priority': 2, 'parameters': {}},
            {'agent': 'relationships', 'action': 'compute_mutual_information', 'priority': 2, 'parameters': {}},

            # Storytelling Steps (Lowest Priority, but always included)
            {'agent': 'storytelling', 'action': 'generate_narrative_summary', 'priority': 1},
            {'agent': 'storytelling', 'action': 'create_summary_report', 'priority': 1},
            {'agent': 'storytelling', 'action': 'create_interactive_dashboard', 'priority': 1}
        ]

        # LLM planning is temporarily bypassed (method-naming issues); use
        # the rule-based plan, highest priority first.
        self.workflow_plan = sorted(rule_based_workflow,
                                    key=lambda step: step.get('priority', 0),
                                    reverse=True)
        self.log(f"Using rule-based workflow with {len(self.workflow_plan)} steps (LLM planning temporarily bypassed due to method naming issues)")
        return self.workflow_plan

    def execute_workflow(self, df: pd.DataFrame) -> Dict:
        """Execute every step in ``self.workflow_plan`` against ``df``.

        Lazily initializes agents and the workflow plan when absent, then
        dispatches each step to ``agents[step['agent']].<step['action']>``
        via ``getattr``. Cleaning-type actions replace the working copy of
        the DataFrame; storytelling-type actions additionally receive the
        agent results accumulated so far. A failing step is logged and
        recorded, and execution continues with the next step.

        Args:
            df: Input DataFrame. A copy is taken; the caller's frame is
                never mutated.

        Returns:
            Dict with keys 'steps_completed' (list of success records),
            'steps_failed' (list of error records with tracebacks), and
            'agent_results' (per-agent, per-action result summaries).
        """
        self.log("=" * 60)
        self.log("EXECUTING MULTI-AGENT EDA WORKFLOW")
        self.log("=" * 60)

        # Lazy setup: allows execute_workflow(df) to be called standalone.
        if not self.agents:
            self.initialize_agents()

        if not self.workflow_plan:
            # This path is now mostly for first run or if explicitly cleared
            self.create_workflow_plan(df)

        execution_results = {
            'steps_completed': [],
            'steps_failed': [],
            'agent_results': {}
        }

        # Work on a copy so cleaning steps never mutate the caller's frame.
        current_df = df.copy()

        for step in self.workflow_plan:
            agent_name = step['agent']
            action = step['action']

            try:
                self.log(f"\n>>> Executing: {agent_name}.{action}")
                agent = self.agents.get(agent_name)

                if not agent:
                    raise ValueError(f"Agent '{agent_name}' not found")

                if not hasattr(agent, action):
                    raise AttributeError(f"Agent '{agent_name}' has no method '{action}'")

                method = getattr(agent, action)

                # Get parameters from the workflow step, default to an empty dict if not present
                params = step.get('parameters', {})

                # Handle methods that modify the DataFrame
                if action in ['handle_missing_values', 'handle_duplicates', 'normalize_data', 'encode_categorical_features']:
                    result = method(current_df, **params) # Pass parameters here
                    if isinstance(result, pd.DataFrame):
                        current_df = result
                        self.log(f"DataFrame updated: {current_df.shape}")
                # Handle methods that need analysis_results
                elif action in ['generate_summary_insights', 'generate_narrative_summary', 'create_interactive_dashboard', 'create_summary_report']:
                    result = method(current_df, execution_results.get('agent_results', {}), **params) # Pass parameters here
                else:
                    result = method(current_df, **params) # Pass parameters here

                execution_results['steps_completed'].append({
                    'agent': agent_name,
                    'action': action,
                    'status': 'success'
                })

                if agent_name not in execution_results['agent_results']:
                    execution_results['agent_results'][agent_name] = {}

                # Store result (avoid storing large DataFrames)
                if isinstance(result, pd.DataFrame):
                    result_summary = f"DataFrame: {result.shape}"
                elif isinstance(result, dict):
                    result_summary = result
                elif isinstance(result, str):
                    # Truncate long strings so the results dict stays small.
                    result_summary = result[:500] + "..." if len(result) > 500 else result
                else:
                    result_summary = str(result)[:500]

                execution_results['agent_results'][agent_name][action] = {
                    'result': result_summary
                }

                self.log(f"\u2713 {action} completed successfully")

            except Exception as e:
                # Record the failure with a full traceback and keep going —
                # one broken step must not abort the whole workflow.
                import traceback
                error_detail = traceback.format_exc()
                self.log(f"\u2718 {action} failed: {e}")
                self.log(f"Error details: {error_detail}")
                execution_results['steps_failed'].append({
                    'agent': agent_name,
                    'action': action,
                    'error': str(e),
                    'details': error_detail
                })

        self.log("\n" + "=" * 60)
        self.log("WORKFLOW EXECUTION COMPLETE")
        self.log(f"Steps completed: {len(execution_results['steps_completed'])}")
        self.log(f"Steps failed: {len(execution_results['steps_failed'])}")

        if execution_results['steps_failed']:
            self.log("\nFailed steps:")
            for failed in execution_results['steps_failed']:
                self.log(f"  - {failed['agent']}.{failed['action']}: {failed['error']}")

        self.log("=" * 60)

        self.results = execution_results
        return execution_results
In [16]:
# Main Execution Pipeline
# =============================================================================

class MultiAgentEDASystem:
    """Facade over the LLM orchestrator: load data, run the EDA workflow,
    expose the dashboard, and export results."""

    def __init__(self):
        self.orchestrator = LLMOrchestratorAgent()
        self.raw_data = None
        self.results = None

    def load_data(self, data_source: Union[str, pd.DataFrame]) -> pd.DataFrame:
        """Load data from an in-memory DataFrame or a CSV file path.

        Raises:
            ValueError: for any other kind of source.
        """
        print("📂 Loading data...")

        if isinstance(data_source, pd.DataFrame):
            loaded = data_source
        elif isinstance(data_source, str) and data_source.endswith('.csv'):
            loaded = pd.read_csv(data_source)
        else:
            raise ValueError("Unsupported data source")
        self.raw_data = loaded

        n_rows, n_cols = self.raw_data.shape
        print(f"✓ Data loaded: {n_rows} rows, {n_cols} columns")
        return self.raw_data

    def run_analysis(self, data_source: Union[str, pd.DataFrame]) -> Dict:
        """End-to-end pipeline: load → initialize agents → plan → execute."""
        frame = self.load_data(data_source)
        orchestrator = self.orchestrator
        orchestrator.initialize_agents()
        orchestrator.create_workflow_plan(frame)
        self.results = orchestrator.execute_workflow(frame)
        return self.results

    def get_dashboard(self) -> Optional[Dash]:
        """Return the storytelling agent's dashboard app, or None if absent."""
        storyteller = self.orchestrator.agents.get('storytelling')
        return storyteller.dashboard_app if storyteller else None

    def export_results(self, output_path: str = 'eda_results.json'):
        """Serialize analysis results to JSON (non-serializable values via str)."""
        with open(output_path, 'w') as f:
            json.dump(self.results, f, indent=2, default=str)
        print(f"✓ Results exported to {output_path}")
In [17]:
# Test Cases
#

def run_test_cases(sample_data: pd.DataFrame) -> Dict:
    """Run comprehensive smoke tests over the multi-agent EDA system.

    Each test case is a small callable; `_run_test` provides the shared
    bookkeeping (header print, exception handling, pass/fail recording),
    replacing ten copy-pasted try/except blocks. A raised exception marks
    the test FAILED without stopping the suite.

    Args:
        sample_data: DataFrame fed to every test case.

    Returns:
        Dict with 'passed'/'failed' counters and a per-test 'tests' list
        (each entry: {'test', 'status'[, 'error']}).
    """
    print("\n" + "="*60)
    print("RUNNING TEST CASES")
    print("="*60 + "\n")

    test_results = {
        'passed': 0,
        'failed': 0,
        'tests': []
    }

    def _run_test(number: int, name: str, body) -> None:
        # Shared harness: print header, execute, record outcome.
        prefix = "" if number == 1 else "\n"
        print(f"{prefix}Test {number}: {name}...")
        try:
            body()
            test_results['passed'] += 1
            test_results['tests'].append({'test': name, 'status': 'PASSED'})
            print("✓ PASSED")
        except Exception as e:
            test_results['failed'] += 1
            test_results['tests'].append({'test': name, 'status': 'FAILED', 'error': str(e)})
            print(f"✗ FAILED: {e}")

    def _check_data_loading():
        # System can ingest an in-memory DataFrame.
        test_system = MultiAgentEDASystem()
        test_df = test_system.load_data(sample_data)
        assert test_df.shape[0] > 0, "Data should not be empty"
        assert test_df.shape[1] > 0, "Data should have columns"

    def _check_quality_assessment():
        quality_agent = DataQualityAgent()
        quality_report = quality_agent.assess_data_quality(sample_data)
        assert 'shape' in quality_report, "Quality report should contain shape"
        assert 'missing_values' in quality_report, "Quality report should contain missing values"
        assert quality_report['shape'] == sample_data.shape, "Shape should match"

    def _check_dtype_identification():
        quality_agent = DataQualityAgent()
        dtypes_info = quality_agent.identify_data_types(sample_data)
        assert 'numerical' in dtypes_info, "Should identify numerical columns"
        assert 'categorical' in dtypes_info, "Should identify categorical columns"
        assert isinstance(dtypes_info['numerical'], list), "Numerical should be a list"

    def _check_summary_statistics():
        exploration_agent = DataExplorationAgent()
        stats = exploration_agent.generate_summary_statistics(sample_data)
        assert 'numerical_summary' in stats, "Should contain numerical summary"
        assert 'categorical_summary' in stats, "Should contain categorical summary"

    def _check_outlier_detection():
        exploration_agent = DataExplorationAgent()
        outliers = exploration_agent.identify_outliers(sample_data)
        assert isinstance(outliers, dict), "Should return a dictionary"
        numeric_cols = sample_data.select_dtypes(include=[np.number]).columns
        assert len(outliers) == len(numeric_cols), "Should detect outliers for all numeric columns"

    def _check_correlations():
        relationship_agent = RelationshipDiscoveryAgent()
        correlations = relationship_agent.compute_correlations(sample_data)
        assert 'pearson' in correlations, "Should contain Pearson correlation"
        assert 'spearman' in correlations, "Should contain Spearman correlation"

    def _check_clustering():
        exploration_agent = DataExplorationAgent()
        clustering_results = exploration_agent.perform_clustering(sample_data, n_clusters=3)
        assert 'n_clusters' in clustering_results, "Should contain cluster information"
        assert clustering_results['n_clusters'] == 3, "Should have 3 clusters"

    def _check_regression():
        relationship_agent = RelationshipDiscoveryAgent()
        regression_results = relationship_agent.perform_regression_analysis(sample_data)
        assert 'r2_score' in regression_results, "Should contain R² score"
        assert 'coefficients' in regression_results, "Should contain coefficients"

    def _check_narrative():
        storytelling_agent = DataStorytellingAgent()
        narrative = storytelling_agent.generate_summary_insights(sample_data, {})
        assert isinstance(narrative, str), "Should return a string"
        assert len(narrative) > 0, "Narrative should not be empty"

    def _check_end_to_end():
        eda_system = MultiAgentEDASystem()
        results = eda_system.run_analysis(sample_data)
        assert 'steps_completed' in results, "Should track completed steps"
        assert 'agent_results' in results, "Should contain agent results"
        assert len(results['steps_completed']) > 0, "Should complete some steps"

    # Ordered suite — (display name, test body). The order and names match
    # the original ten test cases exactly.
    suite = [
        ('Data Loading', _check_data_loading),
        ('Data Quality Assessment', _check_quality_assessment),
        ('Data Type Identification', _check_dtype_identification),
        ('Summary Statistics', _check_summary_statistics),
        ('Outlier Detection', _check_outlier_detection),
        ('Correlation Analysis', _check_correlations),
        ('Clustering', _check_clustering),
        ('Regression Analysis', _check_regression),
        ('Narrative Generation', _check_narrative),
        ('End-to-End Workflow', _check_end_to_end),
    ]
    for number, (name, body) in enumerate(suite, start=1):
        _run_test(number, name, body)

    print("\n" + "="*60)
    print(f"TEST SUMMARY: {test_results['passed']} passed, {test_results['failed']} failed")
    print("="*60 + "\n")

    return test_results
In [18]:
# Evaluation Metrics Report
# =============================================================================

def generate_evaluation_report(results: Dict, test_results: Dict) -> str:
    """Build, print, and return a human-readable evaluation report.

    Robustness fix: `test_results` is now accessed with `.get(...)` defaults,
    consistent with how `results` was already handled, so a partially
    populated dict no longer raises KeyError.

    Args:
        results: Workflow execution results ('steps_completed',
            'steps_failed', 'agent_results').
        test_results: Output of run_test_cases ('passed', 'failed', 'tests').

    Returns:
        The full report as one newline-joined string (also printed).
    """
    print("\n" + "="*60)
    print("EVALUATION METRICS REPORT")
    print("="*60 + "\n")

    report = []
    agent_results = results.get('agent_results', {})
    tests_passed = test_results.get('passed', 0)
    tests_failed = test_results.get('failed', 0)

    # 1. Data Cleaning & Validation
    report.append("1. DATA CLEANING & VALIDATION")
    report.append("-" * 40)
    steps_completed = len(results.get('steps_completed', []))
    total_steps = steps_completed + len(results.get('steps_failed', []))
    # Guard against division by zero when no steps were executed.
    accuracy = (steps_completed / total_steps * 100) if total_steps > 0 else 0
    report.append(f"   Workflow Completion Rate: {accuracy:.1f}%")
    report.append(f"   Steps Executed Successfully: {steps_completed}/{total_steps}")

    if 'quality' in agent_results:
        report.append(f"   Data Quality Checks: ✓ Passed")

    # 2. Summary Statistics & Visualizations
    report.append("\n2. SUMMARY STATISTICS & VISUALIZATIONS")
    report.append("-" * 40)
    if 'exploration' in agent_results:
        report.append(f"   Statistical Analysis: ✓ Complete")
        report.append(f"   Distribution Plots: ✓ Generated")
        report.append(f"   Clustering Analysis: ✓ Complete")

    # 3. Correlation & Regression Analysis
    report.append("\n3. CORRELATION & REGRESSION ANALYSIS")
    report.append("-" * 40)
    if 'relationships' in agent_results:
        report.append(f"   Correlation Matrix: ✓ Computed")
        report.append(f"   Regression Models: ✓ Fitted")
        report.append(f"   Relationship Discovery: ✓ Complete")

    # 4. Interactive Visualizations
    report.append("\n4. INTERACTIVE VISUALIZATIONS")
    report.append("-" * 40)
    if 'storytelling' in agent_results:
        # DASH_AVAILABLE is a module-level flag set where dash is imported.
        report.append(f"   Dashboard Created: {'✓ Yes' if DASH_AVAILABLE else '✗ Dash Not Available'}")
        report.append(f"   Narrative Summary: ✓ Generated")
        report.append(f"   Interactive Components: {'✓ Enabled' if DASH_AVAILABLE else '✗ Disabled'}")

    # 5. Test Case Results
    report.append("\n5. TEST CASE RESULTS")
    report.append("-" * 40)
    total_tests = tests_passed + tests_failed
    report.append(f"   Total Tests: {total_tests}")
    report.append(f"   Passed: {tests_passed}")
    report.append(f"   Failed: {tests_failed}")
    test_pass_rate = (tests_passed / total_tests * 100) if total_tests > 0 else 0
    report.append(f"   Pass Rate: {test_pass_rate:.1f}%")

    # 6. Overall System Performance
    report.append("\n6. OVERALL SYSTEM PERFORMANCE")
    report.append("-" * 40)
    # Overall score = mean of workflow completion rate and test pass rate.
    overall_score = (accuracy + test_pass_rate) / 2
    report.append(f"   Overall Score: {overall_score:.1f}/100")

    if overall_score >= 90:
        grade = "A (Excellent)"
    elif overall_score >= 80:
        grade = "B (Good)"
    elif overall_score >= 70:
        grade = "C (Satisfactory)"
    else:
        grade = "D (Needs Improvement)"

    report.append(f"   Grade: {grade}")

    # 7. Detailed per-test breakdown
    report.append("\n7. DETAILED TEST RESULTS")
    report.append("-" * 40)
    for test in test_results.get('tests', []):
        status_symbol = "✓" if test['status'] == 'PASSED' else "✗"
        report.append(f"   {status_symbol} {test['test']}: {test['status']}")
        if 'error' in test:
            report.append(f"      Error: {test['error']}")

    report.append("\n" + "="*60 + "\n")

    report_text = "\n".join(report)
    print(report_text)
    return report_text
In [19]:
# Cloud Run Deployment Setup
# =============================================================================
def generate_deployment_files():
    """Generate files needed for Google Cloud Run deployment.

    Writes five template files (app.py, Dockerfile, requirements.txt,
    cloudbuild.yaml, DEPLOYMENT_GUIDE.md) to the current working directory.
    The template contents are static strings; `\\\\` inside them renders a
    single literal backslash (shell/Dockerfile line continuations).

    Bug fix: the per-file confirmation print now interpolates the written
    filename instead of printing a constant placeholder.
    """

    # app.py for Cloud Run
    app_py = '''"""
Multi-Agent EDA System - Cloud Run Deployment
"""
import os
from main import MultiAgentEDASystem, Config
import pandas as pd

# Initialize system
eda_system = MultiAgentEDASystem()

# Load your data
# Option 1: From GCS
# from google.cloud import storage
# storage_client = storage.Client()
# bucket = storage_client.bucket(Config.GCS_BUCKET)
# blob = bucket.blob('data/your-data.csv')
# data = pd.read_csv(blob.open('rb'))

# Option 2: From local file (for testing)
data = pd.read_csv('sample_data.csv')

# Run analysis
results = eda_system.run_analysis(data)

# Get dashboard
app = eda_system.get_dashboard()

if __name__ == '__main__':
    port = int(os.environ.get('PORT', 8080))
    if app:
        app.run_server(host='0.0.0.0', port=port, debug=False)
    else:
        print("Dashboard not available. Please install dash and dash-bootstrap-components.")
'''

    # Dockerfile
    dockerfile = '''FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    gcc \\
    g++ \\
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Set environment variables
ENV PORT=8080
ENV PYTHONUNBUFFERED=1

EXPOSE 8080

# Run the application
CMD ["python", "app.py"]
'''

    # requirements.txt
    requirements = '''google-generativeai>=0.3.0
pandas>=1.5.0
numpy>=1.23.0
scikit-learn>=1.2.0
matplotlib>=3.6.0
seaborn>=0.12.0
plotly>=5.11.0
dash>=2.7.0
dash-bootstrap-components>=1.3.0
gunicorn>=20.1.0
google-cloud-storage>=2.7.0
google-cloud-bigquery>=3.4.0
'''

    # cloudbuild.yaml
    cloudbuild = '''steps:
  # Build the container image
  - name: 'gcr.io/cloud-builders/docker'
    args: ['build', '-t', 'gcr.io/$PROJECT_ID/eda-dashboard:$COMMIT_SHA', '.']

  # Push the container image to Container Registry
  - name: 'gcr.io/cloud-builders/docker'
    args: ['push', 'gcr.io/$PROJECT_ID/eda-dashboard:$COMMIT_SHA']

  # Deploy to Cloud Run
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    entrypoint: gcloud
    args:
      - 'run'
      - 'deploy'
      - 'eda-dashboard'
      - '--image=gcr.io/$PROJECT_ID/eda-dashboard:$COMMIT_SHA'
      - '--region=us-central1'
      - '--platform=managed'
      - '--allow-unauthenticated'
      - '--memory=2Gi'
      - '--cpu=2'
      - '--timeout=3600'

images:
  - 'gcr.io/$PROJECT_ID/eda-dashboard:$COMMIT_SHA'
'''

    # deployment_guide.md
    deployment_guide = '''# Deployment Guide for Google Cloud Run

## Prerequisites
1. Google Cloud account with billing enabled
2. gcloud CLI installed and authenticated
3. Docker installed locally (for testing)

## Setup Steps

### 1. Set up Google Cloud Project
```bash
# Set your project ID
export PROJECT_ID=your-project-id
gcloud config set project $PROJECT_ID

# Enable required APIs
gcloud services enable run.googleapis.com
gcloud services enable containerregistry.googleapis.com
gcloud services enable cloudbuild.googleapis.com
```

### 2. Set Environment Variables
```bash
# Set your Gemini API key
export GEMINI_API_KEY=your-api-key

# Optional: Set GCS bucket for data storage
export GCS_BUCKET=your-bucket-name
```

### 3. Deploy Using Cloud Build (Recommended)
```bash
# Submit build to Cloud Build
gcloud builds submit --config cloudbuild.yaml

# Check deployment status
gcloud run services describe eda-dashboard --region=us-central1
```

### 4. Alternative: Manual Deployment
```bash
# Build Docker image
docker build -t gcr.io/$PROJECT_ID/eda-dashboard .

# Push to Container Registry
docker push gcr.io/$PROJECT_ID/eda-dashboard

# Deploy to Cloud Run
gcloud run deploy eda-dashboard \\
  --image gcr.io/$PROJECT_ID/eda-dashboard \\
  --region us-central1 \\
  --platform managed \\
  --allow-unauthenticated \\
  --memory 2Gi \\
  --cpu 2 \\
  --timeout 3600 \\
  --set-env-vars GEMINI_API_KEY=$GEMINI_API_KEY
```

### 5. Test Locally (Optional)
```bash
# Build and run locally
docker build -t eda-dashboard .
docker run -p 8080:8080 \\
  -e GEMINI_API_KEY=$GEMINI_API_KEY \\
  eda-dashboard
```

## Configuration

### Memory and CPU Settings
- Default: 2 GB RAM, 2 vCPUs
- For larger datasets, increase to 4 GB RAM, 4 vCPUs
- Adjust timeout based on dataset size (default: 3600 seconds)

### Environment Variables
Set these in Cloud Run service settings:
- `GEMINI_API_KEY`: Your Gemini API key (required)
- `GCS_BUCKET`: GCS bucket for data storage (optional)
- `GCP_PROJECT_ID`: Your GCP project ID (optional)

## Monitoring and Logs

### View Logs
```bash
gcloud run services logs read eda-dashboard \\
  --region us-central1 \\
  --limit 50
```

### Monitor Performance
```bash
gcloud run services describe eda-dashboard \\
  --region us-central1 \\
  --format="value(status)"
```

## Cost Optimization
1. Use Cloud Run's autoscaling (min instances = 0)
2. Set appropriate memory limits
3. Use Cloud Build caching for faster builds
4. Monitor usage in Google Cloud Console

## Security Best Practices
1. Store API keys in Secret Manager
2. Use IAM roles for GCS access
3. Enable Cloud Armor for DDoS protection
4. Implement authentication for production

## Troubleshooting
- Check logs: `gcloud run services logs read eda-dashboard`
- Verify environment variables are set
- Ensure API keys are valid
- Check memory/CPU limits for large datasets
'''

    print("\n" + "="*60)
    print("GENERATING CLOUD RUN DEPLOYMENT FILES")
    print("="*60 + "\n")

    files = {
        'app.py': app_py,
        'Dockerfile': dockerfile,
        'requirements.txt': requirements,
        'cloudbuild.yaml': cloudbuild,
        'DEPLOYMENT_GUIDE.md': deployment_guide
    }

    for filename, content in files.items():
        with open(filename, 'w') as f:
            f.write(content)
        # FIX: report the actual file written (was a constant placeholder).
        print(f"✓ Created: {filename}")

    print("\n" + "="*60)
    print("DEPLOYMENT FILES GENERATED SUCCESSFULLY")
    print("="*60)
    print("\nTo deploy to Cloud Run:")
    print("1. Review and configure the generated files")
    print("2. Set your GEMINI_API_KEY environment variable")
    print("3. Run: gcloud builds submit --config cloudbuild.yaml")
    print("\nFor detailed instructions, see DEPLOYMENT_GUIDE.md")
    print("="*60 + "\n")
In [20]:
# CELL 14: Usage Example
# =============================================================================

# Demo driver: builds a synthetic customer dataset, runs the full EDA
# pipeline, renders reports, runs the smoke-test suite, and optionally
# generates Cloud Run deployment files.
if __name__ == "__main__":
    # Create sample dataset — seeded so the run is reproducible.
    np.random.seed(42)
    sample_data = pd.DataFrame({
        'customer_id': range(1, 1001),
        'age': np.random.randint(18, 80, 1000),
        'income': np.random.normal(50000, 20000, 1000),
        'purchase_amount': np.random.exponential(100, 1000),
        'num_purchases': np.random.poisson(5, 1000),
        'satisfaction_score': np.random.uniform(1, 5, 1000),
        'region': np.random.choice(['North', 'South', 'East', 'West'], 1000),
        'customer_type': np.random.choice(['New', 'Returning', 'VIP'], 1000),
        'product_category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Home'], 1000)
    })

    # Add missing values so the quality agent has something to clean
    # (50 NaNs in income, 30 in satisfaction_score).
    sample_data.loc[sample_data.sample(50).index, 'income'] = np.nan
    sample_data.loc[sample_data.sample(30).index, 'satisfaction_score'] = np.nan

    print("✓ Sample dataset created")
    print(sample_data.head())

    # Run EDA
    print("\n" + "="*60)
    print("STARTING MULTI-AGENT EDA SYSTEM")
    print("="*60 + "\n")

    eda_system = MultiAgentEDASystem()
    results = eda_system.run_analysis(sample_data)

    print("\n✅ Analysis complete!")

    # ====================
    # VIEW DASHBOARD
    # ====================
    print("\n" + "="*60)
    print("DASHBOARD SETUP")
    print("="*60 + "\n")

    # Create dashboard only when dash is importable (DASH_AVAILABLE is a
    # module-level flag set where dash is imported).
    if DASH_AVAILABLE:
        storytelling_agent = eda_system.orchestrator.agents['storytelling']
        dashboard = storytelling_agent.create_interactive_dashboard(
            eda_system.raw_data,
            results['agent_results']
        )

        if dashboard:
            print("🚀 Dashboard created successfully!")
            print(f"To view dashboard, run:")
            print(f"  dashboard.run_server(debug=True, port={Config.DASH_PORT})")
            print(f"\nOr in Jupyter/Colab:")
            print(f"  dashboard.run_server(mode='inline', port={Config.DASH_PORT})")

            # Uncomment to run dashboard immediately
            # dashboard.run_server(debug=True, port=Config.DASH_PORT)
    else:
        print("⚠️  Dash not available. Install dash and dash-bootstrap-components to enable dashboard.")

    # Generate HTML report as a dashboard-free alternative.
    print("\n📄 Generating HTML report...")
    html_report = eda_system.orchestrator.agents['storytelling'].create_summary_report(
        eda_system.raw_data,
        results['agent_results']
    )
    with open('eda_summary_report.html', 'w') as f:
        f.write(html_report)
    print("✓ HTML report saved to: eda_summary_report.html")

    # ====================
    # RUN TEST CASES
    # ====================
    test_results = run_test_cases(sample_data)

    # ====================
    # EVALUATION REPORT
    # ====================
    evaluation_report = generate_evaluation_report(results, test_results)

    # Save evaluation report
    with open('evaluation_report.txt', 'w') as f:
        f.write(evaluation_report)
    print("✓ Evaluation report saved to: evaluation_report.txt")

    # ====================
    # CLOUD DEPLOYMENT
    # ====================
    print("\n" + "="*60)
    print("CLOUD RUN DEPLOYMENT SETUP")
    print("="*60 + "\n")

    # NOTE(review): input() blocks, and raises EOFError in non-interactive
    # runs (e.g. "Save & Run All") — confirm this is intended for notebooks.
    generate_choice = input("Generate Cloud Run deployment files? (yes/no): ").lower()
    if generate_choice == 'yes':
        generate_deployment_files()
    else:
        print("Skipping deployment file generation.")

    # Export results
    eda_system.export_results('eda_results.json')

    print("\n" + "="*60)
    print("ALL PROCESSES COMPLETED SUCCESSFULLY")
    print("="*60)
    print("\nGenerated Files:")
    print("  1. eda_results.json - Complete analysis results")
    print("  2. eda_summary_report.html - Visual HTML report")
    print("  3. evaluation_report.txt - Performance metrics")
    if generate_choice == 'yes':
        print("  4. Deployment files (app.py, Dockerfile, etc.)")
    print("\n" + "="*60)
✓ Sample dataset created
   customer_id  age        income  purchase_amount  num_purchases  \
0            1   56  77447.555931       266.382594              7   
1            2   69  36444.278140        29.176832              6   
2            3   46  73070.637440       151.946204              3   
3            4   32  42499.787433       174.372500              3   
4            5   60  36117.080930        68.409251              8   

   satisfaction_score region customer_type product_category  
0            1.105191   West           New         Clothing  
1            2.044115  South           New      Electronics  
2            3.152270  North     Returning         Clothing  
3            4.630466   West     Returning      Electronics  
4            4.385741   East     Returning             Food  

============================================================
STARTING MULTI-AGENT EDA SYSTEM
============================================================

📂 Loading data...
✓ Data loaded: 1000 rows, 9 columns
[LLM Orchestrator] Initializing specialized agents...
[LLM Orchestrator] Initialized 5 agents
[LLM Orchestrator] Creating workflow plan...
[LLM Orchestrator] Analyzing dataset requirements...
[LLM Orchestrator] Using rule-based workflow with 22 steps (LLM planning temporarily bypassed due to method naming issues)
[LLM Orchestrator] ============================================================
[LLM Orchestrator] EXECUTING MULTI-AGENT EDA WORKFLOW
[LLM Orchestrator] ============================================================
[LLM Orchestrator] 
>>> Executing: quality.assess_data_quality
[Data Quality] Assessing data quality...
[Data Quality] Quality assessment complete: 1000 rows, 9 columns
[LLM Orchestrator] ✓ assess_data_quality completed successfully
[LLM Orchestrator] 
>>> Executing: quality.handle_duplicates
[Data Quality] Handling duplicate rows...
[Data Quality] Dropped 0 duplicate rows
[LLM Orchestrator] DataFrame updated: (1000, 9)
[LLM Orchestrator] ✓ handle_duplicates completed successfully
[LLM Orchestrator] 
>>> Executing: quality.handle_missing_values
[Data Quality] Handling missing values with strategy: smart
[Data Quality] Missing values handled: 9 columns remaining
[LLM Orchestrator] DataFrame updated: (1000, 9)
[LLM Orchestrator] ✓ handle_missing_values completed successfully
[LLM Orchestrator] 
>>> Executing: quality.identify_data_types
[Data Quality] Identifying data types...
[Data Quality] Found 6 numerical, 3 categorical columns
[LLM Orchestrator] ✓ identify_data_types completed successfully
[LLM Orchestrator] 
>>> Executing: quality.encode_categorical_features
[Data Quality] Encoding categorical features using label encoding...
[Data Quality] Encoded region: 4 unique values
[Data Quality] Encoded customer_type: 3 unique values
[Data Quality] Encoded product_category: 4 unique values
[Data Quality] Categorical encoding complete. New shape: (1000, 12)
[LLM Orchestrator] DataFrame updated: (1000, 12)
[LLM Orchestrator] ✓ encode_categorical_features completed successfully
[LLM Orchestrator] 
>>> Executing: quality.normalize_data
[Data Quality] Normalizing numerical features...
[Data Quality] Normalized 9 numerical columns
[LLM Orchestrator] DataFrame updated: (1000, 12)
[LLM Orchestrator] ✓ normalize_data completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.generate_summary_statistics
[Data Exploration] Generating summary statistics...
[Data Exploration] Summary statistics generated
[LLM Orchestrator] ✓ generate_summary_statistics completed successfully
[LLM Orchestrator] 
>>> Executing: quality.detect_outliers
[Data Quality] Detecting outliers using zscore method...
[Data Quality] Outlier detection complete for 9 columns
[LLM Orchestrator] ✓ detect_outliers completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.plot_distributions
[Data Exploration] Plotting distributions...
No description has been provided for this image
[LLM Orchestrator] ✓ plot_distributions completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.plot_boxplots
[Data Exploration] Plotting box plots...
[LLM Orchestrator] ✓ plot_boxplots completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.plot_countplots
[Data Exploration] Plotting count plots...
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
[LLM Orchestrator] ✓ plot_countplots completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.perform_clustering
[Data Exploration] Performing K-means clustering...
[Data Exploration] Optimal number of clusters: 9
No description has been provided for this image
[LLM Orchestrator] ✓ perform_clustering completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.compute_correlations
[Relationship Discovery] Computing correlation matrices...
[Relationship Discovery] Plotting correlation heatmap...
No description has been provided for this image
[Relationship Discovery] Plot 'Pearson Correlation Heatmap' generated
[Relationship Discovery] Correlation analysis complete
[LLM Orchestrator] ✓ compute_correlations completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.create_scatter_matrix
[Data Exploration] Creating scatter matrix...
No description has been provided for this image
[LLM Orchestrator] ✓ create_scatter_matrix completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.plot_pairplot
[Relationship Discovery] Plotting pairplot...
No description has been provided for this image
[Relationship Discovery] Plot 'Pair Plot' generated
[LLM Orchestrator] ✓ plot_pairplot completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.plot_boxplots_categorical_vs_numerical
[Relationship Discovery] Plotting box plots for categorical vs numerical...
[Relationship Discovery] Box plots for categorical vs numerical generated
[LLM Orchestrator] ✓ plot_boxplots_categorical_vs_numerical completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.identify_strong_relationships
[Relationship Discovery] Identifying relationships above threshold 0.5...
[Relationship Discovery] Found 0 strong relationships
[LLM Orchestrator] ✓ identify_strong_relationships completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.perform_regression_analysis
[Relationship Discovery] Performing regression analysis...
No description has been provided for this image
[Relationship Discovery] Plot 'Linear Regression: R² = 0.002' generated
[LLM Orchestrator] ✓ perform_regression_analysis completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.compute_mutual_information
[Relationship Discovery] Computing mutual information...
No description has been provided for this image
[Relationship Discovery] Mutual information computed for 8 features
[LLM Orchestrator] ✓ compute_mutual_information completed successfully
[LLM Orchestrator] 
>>> Executing: storytelling.generate_narrative_summary
[Data Storytelling] Generating narrative summary...
[Data Storytelling] Narrative summary generated
[LLM Orchestrator] ✓ generate_narrative_summary completed successfully
[LLM Orchestrator] 
>>> Executing: storytelling.create_summary_report
[Data Storytelling] Creating summary report...
[LLM Orchestrator] ✓ create_summary_report completed successfully
[LLM Orchestrator] 
>>> Executing: storytelling.create_interactive_dashboard
[Data Storytelling] Creating interactive dashboard...
[Data Storytelling] Dashboard created successfully
[LLM Orchestrator] ✓ create_interactive_dashboard completed successfully
[LLM Orchestrator] 
============================================================
[LLM Orchestrator] WORKFLOW EXECUTION COMPLETE
[LLM Orchestrator] Steps completed: 22
[LLM Orchestrator] Steps failed: 0
[LLM Orchestrator] ============================================================

✅ Analysis complete!

============================================================
DASHBOARD SETUP
============================================================

[Data Storytelling] Creating interactive dashboard...
[Data Storytelling] Dashboard created successfully
🚀 Dashboard created successfully!
To view dashboard, run:
  dashboard.run_server(debug=True, port=8050)

Or in Jupyter/Colab:
  dashboard.run_server(mode='inline', port=8050)

📄 Generating HTML report...
[Data Storytelling] Creating summary report...
✓ HTML report saved to: eda_summary_report.html

============================================================
RUNNING TEST CASES
============================================================

Test 1: Data Loading...
📂 Loading data...
✓ Data loaded: 1000 rows, 9 columns
✓ PASSED

Test 2: Data Quality Assessment...
[Data Quality] Assessing data quality...
[Data Quality] Quality assessment complete: 1000 rows, 9 columns
✓ PASSED

Test 3: Data Type Identification...
[Data Quality] Identifying data types...
[Data Quality] Found 6 numerical, 3 categorical columns
✓ PASSED

Test 4: Summary Statistics...
[Data Exploration] Generating summary statistics...
[Data Exploration] Summary statistics generated
✓ PASSED

Test 5: Outlier Detection...
[Data Exploration] Identifying outliers using zscore method...
[Data Exploration] Outlier identification complete for 6 columns
✓ PASSED

Test 6: Correlation Analysis...
[Relationship Discovery] Computing correlation matrices...
[Relationship Discovery] Correlation analysis complete
✓ PASSED

Test 7: Clustering...
[Data Exploration] Performing K-means clustering...
No description has been provided for this image
✓ PASSED

Test 8: Regression Analysis...
[Relationship Discovery] Performing regression analysis...
No description has been provided for this image
[Relationship Discovery] Plot 'Linear Regression: R² = 0.004' generated
✓ PASSED

Test 9: Narrative Generation...
[Data Storytelling] Generating summary insights...
[Data Storytelling] Generating narrative summary...
[Data Storytelling] Narrative summary generated
✓ PASSED

Test 10: End-to-End Workflow...
📂 Loading data...
✓ Data loaded: 1000 rows, 9 columns
[LLM Orchestrator] Initializing specialized agents...
[LLM Orchestrator] Initialized 5 agents
[LLM Orchestrator] Creating workflow plan...
[LLM Orchestrator] Analyzing dataset requirements...
[LLM Orchestrator] Using rule-based workflow with 22 steps (LLM planning temporarily bypassed due to method naming issues)
[LLM Orchestrator] ============================================================
[LLM Orchestrator] EXECUTING MULTI-AGENT EDA WORKFLOW
[LLM Orchestrator] ============================================================
[LLM Orchestrator] 
>>> Executing: quality.assess_data_quality
[Data Quality] Assessing data quality...
[Data Quality] Quality assessment complete: 1000 rows, 9 columns
[LLM Orchestrator] ✓ assess_data_quality completed successfully
[LLM Orchestrator] 
>>> Executing: quality.handle_duplicates
[Data Quality] Handling duplicate rows...
[Data Quality] Dropped 0 duplicate rows
[LLM Orchestrator] DataFrame updated: (1000, 9)
[LLM Orchestrator] ✓ handle_duplicates completed successfully
[LLM Orchestrator] 
>>> Executing: quality.handle_missing_values
[Data Quality] Handling missing values with strategy: smart
[Data Quality] Missing values handled: 9 columns remaining
[LLM Orchestrator] DataFrame updated: (1000, 9)
[LLM Orchestrator] ✓ handle_missing_values completed successfully
[LLM Orchestrator] 
>>> Executing: quality.identify_data_types
[Data Quality] Identifying data types...
[Data Quality] Found 6 numerical, 3 categorical columns
[LLM Orchestrator] ✓ identify_data_types completed successfully
[LLM Orchestrator] 
>>> Executing: quality.encode_categorical_features
[Data Quality] Encoding categorical features using label encoding...
[Data Quality] Encoded region: 4 unique values
[Data Quality] Encoded customer_type: 3 unique values
[Data Quality] Encoded product_category: 4 unique values
[Data Quality] Categorical encoding complete. New shape: (1000, 12)
[LLM Orchestrator] DataFrame updated: (1000, 12)
[LLM Orchestrator] ✓ encode_categorical_features completed successfully
[LLM Orchestrator] 
>>> Executing: quality.normalize_data
[Data Quality] Normalizing numerical features...
[Data Quality] Normalized 9 numerical columns
[LLM Orchestrator] DataFrame updated: (1000, 12)
[LLM Orchestrator] ✓ normalize_data completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.generate_summary_statistics
[Data Exploration] Generating summary statistics...
[Data Exploration] Summary statistics generated
[LLM Orchestrator] ✓ generate_summary_statistics completed successfully
[LLM Orchestrator] 
>>> Executing: quality.detect_outliers
[Data Quality] Detecting outliers using zscore method...
[Data Quality] Outlier detection complete for 9 columns
[LLM Orchestrator] ✓ detect_outliers completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.plot_distributions
[Data Exploration] Plotting distributions...
No description has been provided for this image
[LLM Orchestrator] ✓ plot_distributions completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.plot_boxplots
[Data Exploration] Plotting box plots...
[LLM Orchestrator] ✓ plot_boxplots completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.plot_countplots
[Data Exploration] Plotting count plots...
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
[LLM Orchestrator] ✓ plot_countplots completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.perform_clustering
[Data Exploration] Performing K-means clustering...
[Data Exploration] Optimal number of clusters: 9
No description has been provided for this image
[LLM Orchestrator] ✓ perform_clustering completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.compute_correlations
[Relationship Discovery] Computing correlation matrices...
[Relationship Discovery] Plotting correlation heatmap...
No description has been provided for this image
[Relationship Discovery] Plot 'Pearson Correlation Heatmap' generated
[Relationship Discovery] Correlation analysis complete
[LLM Orchestrator] ✓ compute_correlations completed successfully
[LLM Orchestrator] 
>>> Executing: exploration.create_scatter_matrix
[Data Exploration] Creating scatter matrix...
No description has been provided for this image
[LLM Orchestrator] ✓ create_scatter_matrix completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.plot_pairplot
[Relationship Discovery] Plotting pairplot...
No description has been provided for this image
[Relationship Discovery] Plot 'Pair Plot' generated
[LLM Orchestrator] ✓ plot_pairplot completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.plot_boxplots_categorical_vs_numerical
[Relationship Discovery] Plotting box plots for categorical vs numerical...
[Relationship Discovery] Box plots for categorical vs numerical generated
[LLM Orchestrator] ✓ plot_boxplots_categorical_vs_numerical completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.identify_strong_relationships
[Relationship Discovery] Identifying relationships above threshold 0.5...
[Relationship Discovery] Found 0 strong relationships
[LLM Orchestrator] ✓ identify_strong_relationships completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.perform_regression_analysis
[Relationship Discovery] Performing regression analysis...
No description has been provided for this image
[Relationship Discovery] Plot 'Linear Regression: R² = 0.002' generated
[LLM Orchestrator] ✓ perform_regression_analysis completed successfully
[LLM Orchestrator] 
>>> Executing: relationships.compute_mutual_information
[Relationship Discovery] Computing mutual information...
No description has been provided for this image
[Relationship Discovery] Mutual information computed for 8 features
[LLM Orchestrator] ✓ compute_mutual_information completed successfully
[LLM Orchestrator] 
>>> Executing: storytelling.generate_narrative_summary
[Data Storytelling] Generating narrative summary...
[Data Storytelling] Narrative summary generated
[LLM Orchestrator] ✓ generate_narrative_summary completed successfully
[LLM Orchestrator] 
>>> Executing: storytelling.create_summary_report
[Data Storytelling] Creating summary report...
[LLM Orchestrator] ✓ create_summary_report completed successfully
[LLM Orchestrator] 
>>> Executing: storytelling.create_interactive_dashboard
[Data Storytelling] Creating interactive dashboard...
[Data Storytelling] Dashboard created successfully
[LLM Orchestrator] ✓ create_interactive_dashboard completed successfully
[LLM Orchestrator] 
============================================================
[LLM Orchestrator] WORKFLOW EXECUTION COMPLETE
[LLM Orchestrator] Steps completed: 22
[LLM Orchestrator] Steps failed: 0
[LLM Orchestrator] ============================================================
✓ PASSED

============================================================
TEST SUMMARY: 10 passed, 0 failed
============================================================


============================================================
EVALUATION METRICS REPORT
============================================================

1. DATA CLEANING & VALIDATION
----------------------------------------
   Workflow Completion Rate: 100.0%
   Steps Executed Successfully: 22/22
   Data Quality Checks: ✓ Passed

2. SUMMARY STATISTICS & VISUALIZATIONS
----------------------------------------
   Statistical Analysis: ✓ Complete
   Distribution Plots: ✓ Generated
   Clustering Analysis: ✓ Complete

3. CORRELATION & REGRESSION ANALYSIS
----------------------------------------
   Correlation Matrix: ✓ Computed
   Regression Models: ✓ Fitted
   Relationship Discovery: ✓ Complete

4. INTERACTIVE VISUALIZATIONS
----------------------------------------
   Dashboard Created: ✓ Yes
   Narrative Summary: ✓ Generated
   Interactive Components: ✓ Enabled

5. TEST CASE RESULTS
----------------------------------------
   Total Tests: 10
   Passed: 10
   Failed: 0
   Pass Rate: 100.0%

6. OVERALL SYSTEM PERFORMANCE
----------------------------------------
   Overall Score: 100.0/100
   Grade: A (Excellent)

7. DETAILED TEST RESULTS
----------------------------------------
   ✓ Data Loading: PASSED
   ✓ Data Quality Assessment: PASSED
   ✓ Data Type Identification: PASSED
   ✓ Summary Statistics: PASSED
   ✓ Outlier Detection: PASSED
   ✓ Correlation Analysis: PASSED
   ✓ Clustering: PASSED
   ✓ Regression Analysis: PASSED
   ✓ Narrative Generation: PASSED
   ✓ End-to-End Workflow: PASSED

============================================================

✓ Evaluation report saved to: evaluation_report.txt

============================================================
CLOUD RUN DEPLOYMENT SETUP
============================================================

Generate Cloud Run deployment files? (yes/no): yes

============================================================
GENERATING CLOUD RUN DEPLOYMENT FILES
============================================================

✓ Created: app.py
✓ Created: Dockerfile
✓ Created: requirements.txt
✓ Created: cloudbuild.yaml
✓ Created: DEPLOYMENT_GUIDE.md

============================================================
DEPLOYMENT FILES GENERATED SUCCESSFULLY
============================================================

To deploy to Cloud Run:
1. Review and configure the generated files
2. Set your GEMINI_API_KEY environment variable
3. Run: gcloud builds submit --config cloudbuild.yaml

For detailed instructions, see DEPLOYMENT_GUIDE.md
============================================================

✓ Results exported to eda_results.json

============================================================
ALL PROCESSES COMPLETED SUCCESSFULLY
============================================================

Generated Files:
  1. eda_results.json - Complete analysis results
  2. eda_summary_report.html - Visual HTML report
  3. evaluation_report.txt - Performance metrics
  4. Deployment files (app.py, Dockerfile, etc.)

============================================================
In [ ]:
with open('DEPLOYMENT_GUIDE.md', 'r') as f:
    guide_content = f.read()
print(guide_content)